Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimise some LongSequence constructors #320

Merged
merged 1 commit into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,18 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [3.4.0]
* Deprecate functions `n_ambiguous`, `n_gaps` and `n_certain`. Instead, use the
equivalent methods `count(f, seq)` with the appropriate function `f`.
* Deprecate method `Base.count(::Function, ::BioSequence, ::BioSequence)`, and
the other methods of `count` which are subtypes of this.
* Deprecate use of functions `matches` and `mismatches` where the input seqs
have different lengths.
* Optimise `count(==(biosymbol), biosequence)` and `count(==(biosymbol),
biosequence)`
* Optimise contruction of `LongSequence` nucleotide sequences from sequences
with a different bit-number (e.g. two-bit seqs from four-bit seqs)

## [3.3.0]
* Add functions `bioseq` and `guess_alphabet` to easily construct a biosequence
of an unknown alphabet from e.g. a string.
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "BioSequences"
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
authors = ["Sabrina Jaye Ward <[email protected]>", "Jakob Nissen <[email protected]>"]
version = "3.3.0"
version = "3.4.0"

[deps]
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
Expand Down
43 changes: 43 additions & 0 deletions src/bit-manipulation/bit-manipulation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,46 @@ end
y = enumerate_nibbles(b) ⊻ 0x1111111111111111
return count_0000_nibbles(x | y)
end

@inline function four_to_two_bits(x::UInt64)::UInt32
m1 = 0x1111111111111111
m2 = m1 << 1
m3 = m1 | m2
y = (x >> 3) & m1
y |= (x >> 2) & m2
y |= (x >> 1) & m3
pack(y)
end

@inline function pack(x::UInt64)::UInt32
m1 = 0x0f0f0f0f0f0f0f0f
m2 = 0x00ff00ff00ff00ff
m3 = 0x0000ffff0000ffff
m4 = 0x00000000ffffffff
x = (x & m1) | (x & ~m1) >> 2
x = (x & m2) | (x & ~m2) >> 4
x = (x & m3) | (x & ~m3) >> 8
x = (x & m4) | (x & ~m4) >> 16
x % UInt32
end

@inline function two_to_four_bits(x::UInt32)::UInt64
m = 0x1111111111111111
y = expand(x)
z = (y & m) << 1 | (m & ~y)
m2 = y & (m << 1)
m2 = m2 | m2 >> 1
(z & m2) << 2 | (z & ~m2)
end

@inline function expand(x::UInt32)::UInt64
m1 = 0x000000000000ffff
m2 = 0x000000ff000000ff
m3 = 0x000f000f000f000f
m4 = 0x0303030303030303
y = x % UInt64
y = (y & m1) | (y & ~m1) << 16
y = (y & m2) | (y & ~m2) << 8
y = (y & m3) | (y & ~m3) << 4
(y & m4) | (y & ~m4) << 2
end
2 changes: 1 addition & 1 deletion src/longsequences/constructors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ LongSequence{A}(seq::LongSequence{A}) where {A <: Alphabet} = copy(seq)

function (::Type{T})(seq::LongSequence{<:NucleicAcidAlphabet{N}}) where
{N, T<:LongSequence{<:NucleicAcidAlphabet{N}}}
return T(copy(seq.data), seq.len)
return T(seq.data[1:seq_data_len(seq)], seq.len)
end

# Constructors from strings
Expand Down
52 changes: 52 additions & 0 deletions src/longsequences/seqview.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,58 @@

Base.view(seq::SeqOrView, part::AbstractUnitRange) = LongSubSeq(seq, part)

function (::Type{T})(seq::SeqOrView{<:NucleicAcidAlphabet{2}}) where
{T<:LongSequence{<:NucleicAcidAlphabet{4}}}
res = T(undef, length(seq))
v = res.data
(it, (ch, rm)) = iter_chunks(seq)
i = 1
for chunk in it
@inbounds v[i] = two_to_four_bits(chunk % UInt32)
@inbounds v[i + 1] = two_to_four_bits((chunk >> 32) % UInt32)
i += 2
end
mask = UInt64(1) << (rm & 63) - 1
ch &= mask
rm = Int(rm)
while rm > 0
@inbounds v[i] = two_to_four_bits(ch % UInt32)
ch >>= 32
rm -= 32
i += 1
end
res

Check warning on line 95 in src/longsequences/seqview.jl

View check run for this annotation

Codecov / codecov/patch

src/longsequences/seqview.jl#L95

Added line #L95 was not covered by tests
end

function (::Type{T})(seq::SeqOrView{<:NucleicAcidAlphabet{4}}) where
{T<:LongSequence{<:NucleicAcidAlphabet{2}}}
# Throw an error if the sequence contains gaps or ambiguous nucleotides.
if count(iscertain, seq) != length(seq)
throw(EncodeError(Alphabet(T), first(Iterators.filter(!iscertain, seq))))
end
res = T(undef, length(seq))
v = res.data
(it, (ch, rm)) = iter_chunks(seq)
i = 1
new_chunk = zero(UInt)
shift = 0
for chunk in it
new_chunk |= (four_to_two_bits(chunk) % UInt64) << (shift & 63)
shift ⊻= 32
if iszero(shift)
@inbounds v[i] = new_chunk
new_chunk ⊻= new_chunk
i += 1
end
end
if !iszero(rm) || !iszero(shift)
mask = UInt64(1) << (rm & 63) - 1
new_chunk |= (four_to_two_bits(ch & mask) % UInt64) << (shift & 63)
@inbounds v[i] = new_chunk
end
res

Check warning on line 124 in src/longsequences/seqview.jl

View check run for this annotation

Codecov / codecov/patch

src/longsequences/seqview.jl#L124

Added line #L124 was not covered by tests
end

# Conversion
function LongSequence(s::LongSubSeq{A}) where A
_copy_seqview(LongSequence{A}, s)
Expand Down
Loading