#=
This file is part of the Npy.jl package.
Copyright © 2019, 2020 Lukas Himbert
Npy.jl is free software:
you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License.
Npy.jl is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with Npy.jl.
If not, see https://www.gnu.org/licenses/.
=#
"""module Npy
Read and Write .npy files, using mmap().
See `NpyArray`.
"""
module Npy
import Base.isreadable
import Base.iswritable
import Base.size
import Base.IndexStyle
import Base.getindex
import Base.setindex!
import Base.LinearIndices
import Mmap
import Mmap.sync!
include("Header.jl")
using .Header: ParseError, Endianness, LittleEndian, BigEndian, NoEndian
export NpyArray, sync!
# Union of all element types supported by the npy implementation in Header.jl
# (Bool, fixed-width integers, Float32/64, Complex{Float32}/Complex{Float64}).
# `Union{...}` accepts splatted type collections directly, so no `eval` is
# needed; `const` keeps these module-level bindings type-stable.
const SupportedTypeUnion = Union{Header.dtypes_T...} # Union{Bool, ...}
# Union of the corresponding `Type` singletons, used to dispatch on a datatype
# argument, e.g. `NpyArray(f, Float64, shape)`.
const SupportedTypesTypes = Union{[Type{t} for t in Header.dtypes_T]...} # Union{Type{Bool}, ...}
#=
NpyArray type and constructors
=#
"""`NpyArray{T,N} <: AbstractArray{T,N}`
A .npy data file, loaded into memory with `Mmap.mmap`, allowing data files which are larger than the main memory.
The following values are allowed for `T`: `Bool`, `Int8`, `Int16`, `Int32`, `Int64`, `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Float32`, `Float64`, `Complex{Float32}`, `Complex{Float64}`.
Changes to the `NpyArray` change the underlying file correspondingly (if it was opened with write permissions).
To force writing the changes to disk at a particular time, use `sync!`.
When no reference to a particular `NpyArray` remains, the memory is unmapped and all outstanding changes are written to disk at an unspecified future time.
NpyArray(f :: IOStream)
NpyArray(fn :: AbstractString; write :: Bool)
Read a .npy file `f` or `fn`.
The array entries may be changed if and only if `f` is writable or `write == true` (by default, `write == false`).
To check if an existing `NpyArray` is mutable, use the function `iswritable`.
Constructors may throw an `Npy.ParseError` if the file is not a valid .npy file or contains unsupported elements.
NpyArray(f :: IOStream, datatype, shape)
NpyArray(fn :: AbstractString, datatype, shape)
Create or overwrite an existing file `f` or `fn`.
The NpyArray will have the given `shape` and `datatype` and will be writable.
`datatype` needs to be one of the allowed values for `T` listed above.
If the file did not exist prior to opening it this way, the array entries are initialized to zero.
Otherwise, the array values are unspecified.
NpyArray(f :: IOStream, a :: AbstractArray)
NpyArray(fn :: AbstractString, a :: AbstractArray)
Create or overwrite an existing file `f` or `fn` with values from `a`.
The `NpyArray` will be writable.
Note that `a` must have standard (1-based) axes as of version 1.0.0.
NpyArray(f :: Function, ...)
Delegating constructor for `do` notation.
Opens or creates an `NpyArray` (depending on further arguments) and calls f with it; then `sync!`s the `NpyArray`.
Examples
--------
```jldoctest
julia> tmpdir = mktempdir(); # create a temporary directory to create files in
julia> npy_arr = NpyArray(tmpdir * "/abc.npy", Float64[23.0 π; NaN exp(-42)]) # Create a new npy file
2×2 NpyArray{Float64,2}:
23.0 3.14159
NaN 5.74952e-19
julia> sync!(npy_arr)
julia> NpyArray(tmpdir * "/abc.npy") do npy_arr2
# Open the file for reading and print the element type
println(eltype(npy_arr2))
end
Float64
julia> NpyArray(tmpdir * "/def.npy", Complex{Float32}, (2, 3)) do npy_arr3
# Create a new npy array and initialize it
for ind in CartesianIndices(npy_arr3)
npy_arr3[ind] = 1.0 / (ind[1] + ind[2]im)
end
show(stdout, MIME("text/plain"), npy_arr3)
end
2×3 NpyArray{Complex{Float32},2}:
0.5-0.5im 0.2-0.4im 0.1-0.3im
0.4-0.2im 0.25-0.25im 0.153846-0.230769im
julia> rm(tmpdir; recursive=true) # clean up
```
"""
struct NpyArray{T, N} <: AbstractArray{T,N}
# Byte order of the on-disk payload (LittleEndian, BigEndian, or NoEndian for single-byte element types).
endianness :: Header.Endianness
# true: column-major (Fortran) storage; false: row-major (C) storage.
fortran_order :: Bool
# Array dimensions, as recorded in the npy header.
shape :: NTuple{N, UInt64}
# The whole file (header + payload), memory-mapped as a byte vector.
data :: Array{UInt8,1}
# 1-based index into `data` of the first payload byte.
data_offset :: UInt64
# Whether the backing file was opened with write permission.
writable :: Bool
function NpyArray(f :: IOStream)
# Open existing file f as an npy array.
# Map the entire file; `grow=false` so the mapping never resizes the file,
# `shared=true` so stores are written back to disk.
data = Mmap.mmap(f, Array{UInt8,1}, fsize(f), 0; grow=false, shared=true)
# parseheader may throw Npy.ParseError on malformed/unsupported files (see docstring above).
header = Header.parseheader(data)
# Exact size-consistency check in BigInt arithmetic, which cannot overflow.
length(data) == BigInt(header.data_offset) - 1 + prod(BigInt.(header.shape)) * BigInt(sizeof(header.datatype)) || error("File size does not match header size + data size.")
# Repeat the check in UInt64: if the BigInt check passed but this one fails,
# the size computation wrapped around, i.e. it does not fit in UInt64.
length(data) == UInt64(header.data_offset) - 1 + prod(UInt64.(header.shape)) * UInt64(sizeof(header.datatype)) || throw(OverflowError("Somehow, the array size is too large for UInt64."))
return new{header.datatype, length(header.shape)}(header.endianness, header.fortran_order, tuple(UInt64.(header.shape)...), data, header.data_offset, iswritable(f))
end
function NpyArray(f :: IOStream, T :: x, shape) :: NpyArray{T,length(shape)} where {x <: SupportedTypesTypes}
# Make existing file f into an npy array.
iswritable(f) || error("File is not writable")
# Single-byte element types carry no byte order; multi-byte types are written big-endian.
endianness = if sizeof(T) == 1; NoEndian; else BigEndian; end
fortran_order = false
# Every dimension must be a positive integer (shape may contain any Number-like values).
all(x -> isinteger(x) && x > 0, shape) || error("Invalid dimensions $shape.")
# Write array, elements initialized to zero (unless the file already contains data)
header = Header.writeheader(endianness, T, shape, fortran_order)
# BigInt avoids overflow when computing the target file size.
filesize = prod(BigInt.(shape)) * sizeof(T) + length(header)
# truncate() both shrinks and grows; newly appended bytes read as zero.
truncate(f, filesize)
# Mmap the file
data = Mmap.mmap(f, Array{UInt8,1}, fsize(f), 0; grow=false, shared=true)
# Write the header
data[1:length(header)] = header
# Create the NpyArray struct
# data_offset is 1-based: the first payload byte sits just past the header.
return new{T,length(shape)}(endianness, fortran_order, shape, data, length(header)+1, true)
end
end
# Write `arr` into the writable stream `f` as a fresh npy file and return the
# resulting memory-mapped `NpyArray` (same element type and dimensions as `arr`).
function NpyArray(f :: IOStream, arr :: AbstractArray{T,N}) where {T <: SupportedTypeUnion, N}
    out = NpyArray(f, T, size(arr))
    # Broadcast assignment copies every element through the storage-order-aware
    # setindex! machinery, so row-major files are filled correctly too.
    out .= arr
    return out
end
# Open the existing npy file named `fn`; the result is writable iff `write` is true.
# Note: the keyword itself is annotated `::Bool` (the previous spelling
# `write = false :: Bool` only type-asserted the default literal and silently
# accepted non-Bool arguments).
function NpyArray(fn :: AbstractString; write :: Bool = false)
    return open(fn, read=true, write=write) do f
        NpyArray(f)
    end
end
# Create (or overwrite) the file named `fn` and build an NpyArray in it.
# The remaining arguments are forwarded to the IOStream-based constructors.
function NpyArray(fn :: AbstractString, args...)
    open(fn, read=true, write=true, create=true) do io
        NpyArray(io, args...)
    end
end
# `do`-notation wrapper: construct an NpyArray from the remaining arguments,
# pass it to `f`, and always `sync!` outstanding writes afterwards.
# Returns whatever `f` returns.
function NpyArray(f :: Function, args...; kwargs...)
    arr = NpyArray(args...; kwargs...)
    try
        return f(arr)
    finally
        sync!(arr)
    end
end
#=
Useful file functions
=#
# Reading an NpyArray is always permitted.
Base.isreadable(arr :: NpyArray) = true
# Writability is fixed at construction time from the backing file's open mode.
Base.iswritable(arr :: NpyArray) = arr.writable
# Flush outstanding changes in the memory-mapped region to disk.
Mmap.sync!(arr :: NpyArray) = Mmap.sync!(arr.data)
#=
AbstractArray functions
=#
#=
Note:
NpyArray does allow fast linear indexing, so it is tempting to set this to IndexLinear() unconditionally.
Indeed, the getindex(::NpyArray, ::Integer) and setindex!(::NpyArray, ::Any, Integer) functions perform linear indexing into the mmapped memory.
Moreover, the variadic getindex() and setindex!() functions rely on the linear indexing as well.
However, Julia only supports the column major memory layout.
Its builtin conversion functions assume column major linearisation and there does not seem to be a documented way around that.
For this reason, arrays with row major order emulate row major order; the getindex and setindex! functions with linear indices compute the corresponding Cartesian index under the assumption of column major order, then compute the corresponding linear index under the assumption of row major order.
This is of course a waste of computational resources, but what ya gonna do.
Without this emulation, mixing linear indices of arrays with differing linearisation order would lead to bugs that are hard to detect and diagnose.
This is best seen with the following example:
If a = NpyArray(_, [1 2 3; 4 5 6]), then a[1] == a[1,1] == 1, a[2] == a[1,2] == 2, a[3] == a[1,3] == 3, etc.
Whereas if b = [1 2 3; 4 5 6], then b[1] == b[1,1] == 1, b[2] == b[2,1] == 4, b[3] == b[1,2] == 2, etc.
Similarly, broadcasting a .= b would set a[1,1] = 1, a[1,2] = 4, etc., which is incorrect.
Indexing an NpyArray works like this:
- The user indexes an array by writing arr[i] = x (for example), where i is a CartesianIndex or an Int
- This is desugared by the language into setindex!(arr::NpyArray, x, i::typeof(i)) (or a similar getindex call).
- The index is flattened into a linear index, according to the storage order:
- For a CartesianIndex, the index is flattened into a linear index according to the storage order of the array - row major or column major.
This is done by `flatten`.
- For a linear index and a Fortran contiguous array, the linear index is used unchanged.
- For a linear index and a C contiguous array, the linear index is first unflattened into a Cartesian index under the assumption of Fortran contiguous order, then re-flattened under the assumption of C contiguous order.
This emulation of column major order for row major arrays is expensive.
This is done by `unflatten_column_major` and `flatten`.
This is done by `getindex` and `setindex!`.
- The linear index is used as a pointer to the memory mapped data.
This is done by `do_getindex` and `do_setindex!`.
- The value is read/written and converted to/from the host endianness.
This is also done by `do_getindex` and `do_setindex!`.
=#
# `IndexStyle` on an *instance*: Fortran-ordered (column-major) files share
# Julia's native layout, so plain linear indexing is exact and cheap; C-ordered
# files need the column-major emulation (see the note above), so advertise
# IndexCartesian for them.
Base.IndexStyle(arr :: NpyArray) = if arr.fortran_order; IndexLinear(); else IndexCartesian() end
# `IndexStyle` on the *type*: the storage order is not recoverable from the
# type alone, so conservatively report IndexCartesian.
# Fixed: `Type{<:NpyArray}` — the previous `Type{NpyArray}` matched only the
# bare UnionAll and never a concrete type such as `NpyArray{Float64,2}`.
Base.IndexStyle(::Type{<:NpyArray}) = IndexCartesian()
# Dimensions of the array, as recorded in the npy header.
Base.size(arr :: NpyArray) = arr.shape
# Read the element with (storage-order) linear index `i` directly from the
# mapped memory and convert it from the file's byte order to the host's.
function do_getindex(arr :: NpyArray{T,N}, i :: Integer) :: T where {T<:SupportedTypeUnion,N}
    raw = unsafe_load(getptr(arr, i))
    if arr.endianness == LittleEndian
        return ltoh(raw)
    elseif arr.endianness == BigEndian
        return ntoh(raw)
    else
        # NoEndian: single-byte element types carry no byte order.
        @assert sizeof(raw) == 1
        return raw
    end
end
# Store `x` at the element with (storage-order) linear index `i`, converting
# to the file's byte order first. Returns the stored value in host byte order.
function do_setindex!(arr :: NpyArray{T,N}, x, i :: Integer) :: T where {T<:SupportedTypeUnion,N}
    iswritable(arr) || throw(ReadOnlyMemoryError())
    value = convert(T, x)
    wire = if arr.endianness == LittleEndian
        htol(value)
    elseif arr.endianness == BigEndian
        hton(value)
    else
        # NoEndian: single-byte element types carry no byte order.
        @assert sizeof(value) == 1
        value
    end
    unsafe_store!(getptr(arr, i), wire)
    return value
end
# Linear indexing. Fortran-ordered data matches Julia's column-major
# linearisation, so `i` is used directly. C-ordered data must emulate
# column-major semantics: interpret `i` as column major, then re-flatten it
# in the file's row-major storage order (see the design note above).
function Base.getindex(arr :: NpyArray{T,N}, i :: Integer) :: T where {T<:SupportedTypeUnion,N}
    arr.fortran_order && return do_getindex(arr, i)
    return do_getindex(arr, flatten(arr, unflatten_column_major(arr, i)...))
end
# Cartesian indexing: flatten the indices in the file's storage order.
function Base.getindex(arr :: NpyArray{T,N}, inds :: Vararg{I,N}) :: T where {T<:SupportedTypeUnion,I<:Integer,N}
    return do_getindex(arr, flatten(arr, inds...))
end
# Linear-index assignment; same column-major emulation as `getindex`.
function Base.setindex!(arr :: NpyArray{T,N}, x, i :: Integer) :: T where {T<:SupportedTypeUnion,N}
    arr.fortran_order && return do_setindex!(arr, x, i)
    return do_setindex!(arr, x, flatten(arr, unflatten_column_major(arr, i)...))
end
# Cartesian assignment: flatten the indices in the file's storage order, then store.
function Base.setindex!(arr :: NpyArray{T,N}, x, inds :: Vararg{I,N}) :: T where {T<:SupportedTypeUnion,I<:Integer,N}
    return do_setindex!(arr, x, flatten(arr, inds...))
end
#=
Helper functions
=#
# Typed pointer into the memory-mapped buffer for the element with 1-based
# linear index `i` (in the file's storage order).
function getptr(arr :: NpyArray{T,N}, i :: Integer) :: Ptr{T} where {T<:SupportedTypeUnion,N}
@boundscheck 1 <= i && i <= length(arr) || throw(BoundsError(arr, i))
# Both the first and the last byte of element `i` must lie inside the mapped buffer.
@boundscheck @assert checkbounds(Bool, arr.data, arr.data_offset + (i - 1) * sizeof(T))
@boundscheck @assert checkbounds(Bool, arr.data, arr.data_offset + i * sizeof(T) - 1)
# `data_offset` is a 1-based index while `pointer` addresses the first byte
# directly, hence the extra -1.
return convert(Ptr{T}, pointer(arr.data) + arr.data_offset - 1 + (i - 1) * sizeof(T))
end
# Size of the stream `f` in bytes, determined by seeking to the end.
# The stream position is restored before returning.
function fsize(f :: IOStream)
    saved = position(f)
    seekend(f)
    nbytes = position(f)
    seek(f, saved)
    return nbytes
end
# Map a Cartesian index to the 1-based linear index of the element in the
# file's storage order (column major if `arr.fortran_order`, else row major).
function flatten(arr :: NpyArray, inds :: CartesianIndex)
    return flatten(arr, Tuple(inds)...)
end
function flatten(arr :: NpyArray, inds...)
    @boundscheck length(inds) == length(arr.shape) || throw(BoundsError(arr, inds))
    # Walk the dimensions from the fastest-varying to the slowest-varying one:
    # dimension 1 first for Fortran (column-major) order, dimension N first for
    # C (row-major) order. Iterating a plain index range replaces the previous
    # `reverse(collect(enumerate(inds)))`, which allocated two temporary arrays
    # on every element access of a C-ordered array.
    order = arr.fortran_order ? (1:length(inds)) : (length(inds):-1:1)
    ind = 0
    stride = 1
    for i in order
        x = inds[i]
        @boundscheck 1 <= x && x <= size(arr)[i] || throw(BoundsError(arr, inds))
        ind += (x - 1) * stride
        # shape entries are UInt64, so the accumulators promote exactly as before.
        stride *= arr.shape[i]
    end
    return ind + 1
end
# Inverse of column-major flattening: split the linear index `idx` into a
# Cartesian index tuple under Julia's native column-major linearisation.
function unflatten_column_major(arr :: NpyArray, idx)
    remaining = idx - 1  # 0-based arithmetic is simpler for divrem
    coords = Vector{Int}(undef, length(size(arr)))
    for (dim, extent) in enumerate(size(arr))
        remaining, coords[dim] = divrem(remaining, extent)
    end
    # divrem produced 0-based coordinates; convert back to 1-based indices.
    return Tuple(coords .+ 1)
end
end