Npy.jl/src/Npy.jl

#=
This file is part of the Npy.jl package.

Copyright © 2019, 2020  Lukas Himbert

Npy.jl is free software:
you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License.

Npy.jl is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with Npy.jl.
If not, see https://www.gnu.org/licenses/.
=#


"""module Npy

Read and write .npy files, using `Mmap.mmap`.
See `NpyArray`.
"""
module Npy

import Base.isreadable
import Base.iswritable
import Base.size
import Base.IndexStyle
import Base.getindex
import Base.setindex!
import Base.LinearIndices

import Mmap
import Mmap.sync!

include("Header.jl")

using .Header: ParseError, Endianness, LittleEndian, BigEndian, NoEndian


export NpyArray, sync!


SupportedTypeUnion = eval(Expr(:curly, Union, Header.dtypes_T...)) # Union{Bool, ...}
SupportedTypesTypes = eval(Expr(:curly, Union, (Expr(:curly, Type, t) for t in Header.dtypes_T)...)) # Union{Type{Bool}, ...}


#=
  NpyArray type and constructors
=#


"""`NpyArray{T,N} <: AbstractArray{T,N}`

A .npy data file mapped into memory with `Mmap.mmap`, which allows working with files larger than main memory.
The following values are allowed for `T`: `Bool`, `Int8`, `Int16`, `Int32`, `Int64`, `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Float32`, `Float64`, `Complex{Float32}`, `Complex{Float64}`.

Changes to the `NpyArray` change the underlying file correspondingly (if it was opened with write permissions).
To force writing the changes to disk at a particular time, use `sync!`.
When no reference to a particular `NpyArray` remains, the memory is unmapped and all outstanding changes are written to disk at an unspecified future time.

    NpyArray(f :: IOStream)
    NpyArray(fn :: AbstractString; write :: Bool)

Read a .npy file `f` or `fn`.
The array entries may be changed if and only if `f` is writable or `write == true` (by default, `write == false`).
To check if an existing `NpyArray` is mutable, use the function `iswritable`.

Constructors may throw an `Npy.ParseError` if the file is not a valid .npy file or contains unsupported elements.

    NpyArray(f :: IOStream, datatype, shape)
    NpyArray(fn :: AbstractString, datatype, shape)

Create the file `f` or `fn`, or overwrite it if it already exists.
The NpyArray will have the given `shape` and `datatype` and will be writable.
`datatype` needs to be one of the allowed values for `T` listed above.
If the file did not exist prior to opening it this way, the array entries are initialized to zero.
Otherwise, the array values are unspecified.

    NpyArray(f :: IOStream, a :: AbstractArray)
    NpyArray(fn :: AbstractString, a :: AbstractArray)

Create the file `f` or `fn`, or overwrite it if it already exists, and fill it with the values from `a`.
The `NpyArray` will be writable.
Note that `a` must have standard (1-based) axes as of version 1.0.0.

    NpyArray(f :: Function, ...)

Delegating constructor for `do` notation.
Opens or creates an `NpyArray` (depending on further arguments) and calls f with it; then `sync!`s the `NpyArray`.


Examples
--------

```jldoctest
julia> tmpdir = mktempdir(); # create a temporary directory to create files in

julia> npy_arr = NpyArray(tmpdir * "/abc.npy", Float64[23.0 π; NaN exp(-42)]) # Create a new npy file
2×2 NpyArray{Float64,2}:
  23.0  3.14159    
 NaN    5.74952e-19

julia> sync!(npy_arr)

julia> NpyArray(tmpdir * "/abc.npy") do npy_arr2
         # Open the file for reading and print the element type
         println(eltype(npy_arr2))
       end
Float64

julia> NpyArray(tmpdir * "/def.npy", Complex{Float32}, (2, 3)) do npy_arr3
         # Create a new npy array and initialize it
         for ind in CartesianIndices(npy_arr3)
           npy_arr3[ind] = 1.0 / (ind[1] + ind[2]im)
         end
         show(stdout, MIME("text/plain"), npy_arr3)
       end
2×3 NpyArray{Complex{Float32},2}:
 0.5-0.5im   0.2-0.4im        0.1-0.3im     
 0.4-0.2im  0.25-0.25im  0.153846-0.230769im

julia> rm(tmpdir; recursive=true) # clean up
```
"""
struct NpyArray{T, N} <: AbstractArray{T,N}
  endianness :: Header.Endianness
  fortran_order :: Bool
  shape :: NTuple{N, UInt64}
  data :: Array{UInt8,1}
  data_offset :: UInt64
  writable :: Bool

  function NpyArray(f :: IOStream)
    # Open existing file f as an npy array.
    data = Mmap.mmap(f, Array{UInt8,1}, fsize(f), 0; grow=false, shared=true)
    header = Header.parseheader(data)
    length(data) == BigInt(header.data_offset) - 1 + prod(BigInt.(header.shape)) * BigInt(sizeof(header.datatype)) || error("File size does not match header size + data size.")
    length(data) == UInt64(header.data_offset) - 1 + prod(UInt64.(header.shape)) * UInt64(sizeof(header.datatype)) || throw(OverflowError("Somehow, the array size is too large for UInt64."))
    return new{header.datatype, length(header.shape)}(header.endianness, header.fortran_order, tuple(UInt64.(header.shape)...), data, header.data_offset, iswritable(f))
  end

  function NpyArray(f :: IOStream, T :: x, shape) :: NpyArray{T,length(shape)} where {x <: SupportedTypesTypes}
    # Make existing file f into an npy array.
    iswritable(f) || error("File is not writable")
    # One-byte element types have no byte order; multi-byte element types are written big-endian.
    endianness = if sizeof(T) == 1; NoEndian; else BigEndian; end
    fortran_order = false
    all(x -> isinteger(x) && x > 0, shape) || error("Invalid dimensions $shape.")
    # Write array, elements initialized to zero (unless the file already contains data)
    header = Header.writeheader(endianness, T, shape, fortran_order)
    filesize = prod(BigInt.(shape)) * sizeof(T) + length(header)
    truncate(f, filesize)
    # Mmap the file
    data = Mmap.mmap(f, Array{UInt8,1}, fsize(f), 0; grow=false, shared=true)
    # Write the header
    data[1:length(header)] = header
    # Create the NpyArray struct
    return new{T,length(shape)}(endianness, fortran_order, shape, data, length(header)+1, true)
  end
end
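

#=
Rough sketch of the on-disk layout that `data` and `data_offset` refer to.
The exact header handling lives in Header.jl; this sketch follows the published .npy
format description and is meant as an illustration only:

    byte 1                                                       byte data_offset
    |                                                            |
    v                                                            v
    "\x93NUMPY" | major | minor | HEADER_LEN | header dict ...   | raw element data ...

The header dict records the element dtype and byte order, the shape, and whether the data
is stored in C or Fortran order; everything from data_offset onwards is the raw element
data that the indexing functions below read and write through the mmap.
=#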


function NpyArray(f :: IOStream, arr :: AbstractArray{T,N}) where {T <: SupportedTypeUnion, N}
  # Copy data from arr to the new npy file f
  npy_arr = NpyArray(f, T, size(arr))
  npy_arr .= arr
  return npy_arr
end


function NpyArray(fn :: AbstractString; write :: Bool = false)
  # Open existing file by name
  return open(fn, read=true, write=write) do f
    NpyArray(f)
  end
end


function NpyArray(fn :: AbstractString, args...)
  # Create file by name
  return open(fn, read=true, write=true, create=true) do f
    NpyArray(f, args...)
  end
end


function NpyArray(f :: Function, args...; kwargs...)
  # convenient context manager wrapper (for do-notation)
  arr = NpyArray(args...; kwargs...)
  return try
    f(arr)
  finally
    sync!(arr)
  end
end


#=
  Useful file functions
=#


Base.isreadable(arr :: NpyArray) = true
Base.iswritable(arr :: NpyArray) = arr.writable
Mmap.sync!(arr :: NpyArray) = Mmap.sync!(arr.data)


#=
  AbstractArray functions
=#


#=
Note:
NpyArray does allow fast linear indexing, so it is tempting to set this to IndexLinear() unconditionally.
Indeed, the getindex(::NpyArray, ::Integer) and setindex!(::NpyArray, ::Any, Integer) functions perform linear indexing into the mmapped memory.
Moreover, the variadic getindex() and setindex!() functions rely on the linear indexing as well.

However, Julia only supports the column major memory layout.
Its builtin conversion functions assume column major linearisation and there does not seem to be a documented way around that.

For this reason, arrays with row major order emulate column major order; the getindex and setindex! functions with linear indices compute the corresponding Cartesian index under the assumption of column major order, then compute the corresponding linear index under the assumption of row major order.
This is of course a waste of computational resources, but what ya gonna do.

Without this emulation, mixing linear indices of arrays with differing linearisation order would lead to bugs that are hard to detect and diagnose.
This is best seen with the following example:
If a = NpyArray(_, [1 2 3; 4 5 6]), then a[1] == a[1,1] == 1, a[2] == a[1,2] == 2, a[3] == a[1,3] == 3, etc.
Whereas if b =     [1 2 3; 4 5 6], then  b[1] == b[1,1] == 1, b[2] == b[2,1] == 4, b[3] == b[1,2] == 2, etc.
Similarly, broadcasting a .= b would set a[1,1] = 1, a[1,2] = 4, etc., which is incorrect.


Indexing an NpyArray works like this:

- The user indexes an array by writing arr[i] = x (for example), where i is a CartesianIndex or an Int
- This is desugared by the language into setindex!(arr::NpyArray, x, i::typeof(i)) (or a similar getindex call).
- The index is flattened into a linear index, according to the storage order:
  - For a CartesianIndex, the index is flattened into a linear index according to the storage order of the array - row major or column major.
    This is done by `flatten`.
  - For a linear index and a Fortran contiguous array, the linear index is used unchanged.
  - For a linear index and a C contiguous array, the linear index is first unflattened into a Cartesian index under the assumption of Fortran contiguous order, then re-flattened under the assumption of C contiguous order.
    This emulation of column major order for row major arrays is expensive.
    This is done by `unflatten_column_major` and `flatten`.
  This is done by `getindex` and `setindex!`.
- The linear index is used as a pointer to the memory mapped data.
  This is done by `do_getindex` and `do_setindex!`.
- The value is read/written and converted to/from the host endianness.
  This is also done by `do_getindex` and `do_setindex!`.
=#
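#=
Worked example of the emulation (illustrative only):
for the C-contiguous a = NpyArray(_, [1 2 3; 4 5 6]) from above, the linear index 4 is first
interpreted column-major, yielding the Cartesian index (2, 2), and then re-flattened row-major,
yielding physical element 5:

    unflatten_column_major(a, 4)  # -> (2, 2)
    flatten(a, 2, 2)              # -> 5, since (2 - 1) * 3 + (2 - 1) * 1 + 1 == 5

so a[4] == 5 == a[2, 2], just as for an ordinary (column major) Julia Array.
=#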
Base.IndexStyle(arr :: NpyArray) = if arr.fortran_order; IndexLinear(); else IndexCartesian() end
Base.IndexStyle(::Type{<:NpyArray}) = IndexCartesian()


Base.size(arr :: NpyArray) = arr.shape


function do_getindex(arr :: NpyArray{T,N}, i :: Integer) :: T where {T<:SupportedTypeUnion,N}
  x = unsafe_load(getptr(arr, i))
  if arr.endianness == LittleEndian
    return ltoh(x)
  elseif arr.endianness == BigEndian
    return ntoh(x)
  else
    @assert sizeof(x) == 1
    return x
  end
end


function do_setindex!(arr :: NpyArray{T,N}, x, i :: Integer) :: T where {T<:SupportedTypeUnion,N}
  iswritable(arr) || throw(ReadOnlyMemoryError())
  x_ = convert(T, x)
  if arr.endianness == LittleEndian
    x__ = htol(x_)
  elseif arr.endianness == BigEndian
    x__ = hton(x_)
  else
    @assert sizeof(x_) == 1
    x__ = x_
  end
  unsafe_store!(getptr(arr, i), x__)
  return x_
end


function Base.getindex(arr :: NpyArray{T,N}, i :: Integer) :: T where {T<:SupportedTypeUnion,N}
  if arr.fortran_order
    return do_getindex(arr, i)
  else
    return do_getindex(arr, flatten(arr, unflatten_column_major(arr, i)...))
  end
end


function Base.getindex(arr :: NpyArray{T,N}, inds :: Vararg{I,N}) :: T where {T<:SupportedTypeUnion,I<:Integer,N}
  return do_getindex(arr, flatten(arr, inds...))
end


function Base.setindex!(arr :: NpyArray{T,N}, x, i :: Integer) :: T where {T<:SupportedTypeUnion,N}
  if arr.fortran_order
    return do_setindex!(arr, x, i)
  else
    return do_setindex!(arr, x, flatten(arr, unflatten_column_major(arr, i)...))
  end
end


function Base.setindex!(arr :: NpyArray{T,N}, x, inds :: Vararg{I,N}) :: T where {T<:SupportedTypeUnion,I<:Integer,N}
  return do_setindex!(arr, x, flatten(arr, inds...))
end


#=
   Helper functions
=#


function getptr(arr :: NpyArray{T,N}, i :: Integer) :: Ptr{T} where {T<:SupportedTypeUnion,N}
  @boundscheck 1 <= i && i <= length(arr) || throw(BoundsError(arr, i))
  @boundscheck @assert checkbounds(Bool, arr.data, arr.data_offset + (i - 1) * sizeof(T))
  @boundscheck @assert checkbounds(Bool, arr.data, arr.data_offset + i * sizeof(T) - 1)
  return convert(Ptr{T}, pointer(arr.data) + arr.data_offset - 1 + (i - 1) * sizeof(T))
end
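# Illustrative pointer arithmetic for getptr: assuming a 64-byte header (so arr.data_offset == 65)
# and T == Float64, element i == 3 occupies bytes data[81:88], and the returned pointer is
# pointer(arr.data) + 64 + 2 * 8.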


function fsize(f :: IOStream)
  pos = position(f)
  seekend(f)
  ret = position(f)
  seek(f, pos)
  return ret
end


function flatten(arr :: NpyArray, inds :: CartesianIndex)
  return flatten(arr, Tuple(inds)...)
end


function flatten(arr :: NpyArray, inds...)
  @boundscheck length(inds) == length(arr.shape) || throw(BoundsError(arr, inds))
  ind = 0
  stride = 1
  if arr.fortran_order
    for (i, x) in enumerate(inds)
      @boundscheck 1 <= x && x <= size(arr)[i] || throw(BoundsError(arr, inds))
      ind += (x - 1) * stride
      stride *= arr.shape[i]
    end
  else
    for (i, x) in reverse(collect(enumerate(inds)))
      @boundscheck 1 <= x && x <= size(arr)[i] || throw(BoundsError(arr, inds))
      ind += (x - 1) * stride
      stride *= arr.shape[i]
    end
  end
  return ind + 1
end


function unflatten_column_major(arr :: NpyArray, idx)
  idx -= 1
  inds = zeros(Int, length(size(arr)))
  for (i, n) in enumerate(size(arr))
    idx, inds[i] = divrem(idx, n)
  end
  return Tuple(inds .+ 1)
end
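# Illustrative only: unflatten_column_major interprets idx - 1 as a mixed-radix number whose
# digit radices are the array dimensions, least significant digit first. For size(arr) == (2, 3):
#   unflatten_column_major(arr, 5)  # -> (1, 3), because 5 - 1 == 0 * 1 + 2 * 2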


end # module Npy