Skip to content

Commit

Permalink
Use internal buffer for vecorized writing and do not close IO after w…
Browse files Browse the repository at this point in the history
…riting (#12)

* Use buffer for writing

* Allow providing predefined buffer

* Finalize transcoding stream without closing underlying io

* Add tests for buffered writing

* Update src/io.jl

Co-authored-by: Seth Axen <[email protected]>

* Update src/io.jl

Co-authored-by: Seth Axen <[email protected]>

* Update src/io.jl

Co-authored-by: Seth Axen <[email protected]>

* Move @inbounds and @views outwards

* Update test/io.jl

Co-authored-by: Seth Axen <[email protected]>

* Rename `unit_vsize` to `buffer_size`

* Extend test

* Run formatter

* Change `unit_vsize` to `buffer_size` in documentation

* Use flush instead of closewrite

* Add Manifest.toml to .gitignore

Co-authored-by: Seth Axen <[email protected]>
  • Loading branch information
andreasKroepelin and sethaxen authored Jan 20, 2023
1 parent 2d650ad commit 051bab3
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
*.jl.cov
*.jl.mem
/docs/build/
Manifest.toml
58 changes: 45 additions & 13 deletions src/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ function read_mmap(path::AbstractString, T::Type{MRCData})
end

"""
write(io::IO, ::MRCData; compress = :none)
write(fn::AbstractString, ::MRCData; compress = :auto)
write(io::IO, ::MRCData; compress = :none, buffer_size = 4096, buffer = nothing)
write(fn::AbstractString, ::MRCData; compress = :auto, buffer_size = 4096, buffer = nothing)
Write an instance of [`MRCData`](@ref) to an IO stream or new file.
Use `compress` to specify the compression with the following options:
Expand All @@ -84,34 +84,66 @@ Use `compress` to specify the compression with the following options:
- `:bz2`: BZ2
- `:xz`: XZ
- `:none`: no compression
The parameter `buffer_size` specifies the size (in bytes) of an intermediate
buffer that is used to speed up the writing by utilizing vectorized writes.
You can also directly provide a preallocated buffer as a `Vector`.
In that case, `buffer_size` has no effect.
Note that `eltype(buffer)` must match the data type of the MRC data.
"""
write(::Any, ::MRCData)
function Base.write(io::IO, d::MRCData; compress=:none, unit_vsize=4096)
function Base.write(
io::IO,
d::MRCData;
compress=:none,
buffer_size=4096,
buffer::Union{Nothing,Vector}=nothing,
)
newio = compressstream(io, compress)
h = header(d)
sz = write(newio, h)
sz += write(newio, extendedheader(d))
T = datatype(h)
data = parent(d)
fswap = bswapfromh(h.machst)
unit_vsize = div(unit_vsize, sizeof(T))
vlen = length(data)
vrem = vlen % unit_vsize
if vrem != 0
@inbounds @views sz += write(newio, fswap.(T.(data[1:vrem])))
buffer_size = div(buffer_size, sizeof(T))
if buffer === nothing
buffer = Vector{T}(undef, buffer_size)
elseif !(buffer isa Vector{T})
throw(
ArgumentError(
"`buffer` must be `nothing` or a `Vector{$T}`, but `$(typeof(buffer))` was provided",
),
)
end
for i in (vrem + 1):unit_vsize:vlen
@inbounds @views sz += write(newio, fswap.(T.(data[i:(i + unit_vsize - 1)])))
# If `buffer` was provided as a parameter then `buffer_size` is redundant and
# we must make sure that it matches `buffer`.
buffer_size = length(buffer)
vlen = length(data)
vrem = vlen % buffer_size
@inbounds @views begin
if vrem != 0
buffer[1:vrem] .= fswap.(T.(data[1:vrem]))
sz += write(newio, buffer[1:vrem])
end
for i in (vrem + 1):buffer_size:vlen
buffer .= fswap.(T.(data[i:(i + buffer_size - 1)]))
sz += write(newio, buffer)
end
end
close(newio)
write(newio, TranscodingStreams.TOKEN_END)
flush(newio)
return sz
end
function Base.write(fn::AbstractString, object::T; compress=:auto, unit_vsize=4096) where {T<:Union{MRCData}}
function Base.write(
fn::AbstractString, object::T; compress=:auto, kwargs...
) where {T<:Union{MRCData}}
if compress == :auto
compress = checkextension(fn)
end
return open(fn; write=true) do io
return write(io, object; compress=compress, unit_vsize=unit_vsize)
return write(io, object; compress=compress, kwargs...)
end
end

Expand Down
4 changes: 2 additions & 2 deletions test/header.jl
Original file line number Diff line number Diff line change
Expand Up @@ -128,11 +128,11 @@ end
@test MRC.entrytobytes(:map, "abc") == [0x61, 0x62, 0x63, 0x00]
@test MRC.entrytobytes(:map, "abcde") == [0x61, 0x62, 0x63, 0x64]
@test MRC.entrytobytes(:label, ("abcd", ntuple(_ -> "", 9)...))[1:80] ==
[0x61; 0x62; 0x63; 0x64; zeros(UInt8, 76)]
[0x61; 0x62; 0x63; 0x64; zeros(UInt8, 76)]
@test MRC.entrytobytes(:testfloat, Float32(3)) == reinterpret(UInt8, [Float32(3)])
@test MRC.entrytobytes(:testint, Int32(3)) == reinterpret(UInt8, [Int32(3)])
@test MRC.entrytobytes(:testuintbool, (0x01, 0x02, 0x03, 0x04)) ==
[0x01, 0x02, 0x03, 0x04]
[0x01, 0x02, 0x03, 0x04]
end

@testset "fieldoffsets" begin
Expand Down
40 changes: 38 additions & 2 deletions test/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@
@test h.machst === (0x44, 0x41, 0x0, 0x0)
@test h.rms === Float32(0.15705723)
@test h.nlabl === Int32(1)
@test h.label == ("::::EMDATABANK.org::::EMD-3001::::", "", "", "", "", "", "", "", "", "")
@test h.label ==
("::::EMDATABANK.org::::EMD-3001::::", "", "", "", "", "", "", "", "", "")
end

@testset "emd3197.map" begin
Expand Down Expand Up @@ -89,7 +90,42 @@
@test h.machst === (0x44, 0x41, 0x0, 0x0)
@test h.rms === Float32(2.399953)
@test h.nlabl === Int32(1)
@test h.label == ("::::EMDATABANK.org::::EMD-3197::::", "", "", "", "", "", "", "", "", "")
@test h.label ==
("::::EMDATABANK.org::::EMD-3197::::", "", "", "", "", "", "", "", "", "")
end
end
end

@testset "write" begin
@testset "buffer size has no effect on data" begin
emd3001 = read("$(@__DIR__)/testdata/emd_3001.map", MRCData)
buffer_sizes = [1024, 2048, 4096, 100, 42]

for buffer_size in buffer_sizes
@testset "buffer size: $buffer_size" begin
# no preallocation
io = IOBuffer(; read=true, write=true)
write(io, emd3001; buffer_size=buffer_size)
flush(io)
seekstart(io)
@test read(io, MRCData) == emd3001
close(io)

# with preallocation
io = IOBuffer(; read=true, write=true)
buffer = Vector{Float32}(undef, buffer_size)
write(io, emd3001; buffer=buffer)
flush(io)
seekstart(io)
@test read(io, MRCData) == emd3001
close(io)
end
end
end

@testset "buffer eltype must match data type" begin
emd3001 = read("$(@__DIR__)/testdata/emd_3001.map", MRCData)
buffer = Vector{Float64}(undef, 1)
@test_throws ArgumentError write(IOBuffer(), emd3001; buffer=buffer)
end
end

2 comments on commit 051bab3

@sethaxen
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/76066

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.0 -m "<description of version>" 051bab36c309be3a9ef09ed5b464cf67fec4ee0c
git push origin v0.1.0

Please sign in to comment.