Skip to content

Commit

Permalink
Add tryencode
Browse files Browse the repository at this point in the history
This is an internal method (so far) that is similar to `encode`, except that it
returns `nothing` on failure instead of throwing an error.

Future questions:
* Should it remain internal? It's mentioned in the docs for Alphabet, so maybe
  not
  • Loading branch information
jakobnissen committed Oct 25, 2024
1 parent 8fa5616 commit bf13afb
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 13 deletions.
46 changes: 33 additions & 13 deletions src/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ and T for a DNA Alphabet that requires only 2 bits to represent each symbol.
of the alphabet's element type, as well as the decoding, the inverse process.
* An `Alphabet`'s `encode` method must not produce invalid data.
### Required methods
Every subtype `A` of `Alphabet` must implement:
* `Base.eltype(::Type{A})::Type{S}` for some eltype `S`, which must be a `BioSymbol`.
* `symbols(::A)::Tuple{Vararg{S}}`. This gives tuples of all symbols in the set of `A`.
Expand All @@ -34,7 +35,10 @@ the encoded representation `E` must be of type `UInt`, and you must also impleme
* `BitsPerSymbol(::A)::BitsPerSymbol{N}`, where the `N` must be zero
or a power of two in [1, 2, 4, 8, 16, 32, [64 for 64-bit systems]].
For increased performance, see [`BioSequences.AsciiAlphabet`](@ref)
### Optional methods
* `BitsPerSymbol` for compatibility with existing `BioSequence`s
* `AsciiAlphabet` for increased printing/writing efficiency
* `tryencode` for fallible encoding.
"""
abstract type Alphabet end

Expand Down Expand Up @@ -68,8 +72,14 @@ The number of bits required to represent a packed symbol encoding in a vector of
function bits_per_symbol(A::Alphabet)
    # Look up the alphabet's `BitsPerSymbol` trait and delegate to the
    # `bits_per_symbol` method defined on that trait.
    return bits_per_symbol(BitsPerSymbol(A))
end

function Base.length(A::Alphabet)
    # The length of an alphabet is the number of entries returned by `symbols`.
    return length(symbols(A))
end

## Bits per symbol

"""
BitsPerSymbol{N}
A trait object specifying the number of bits it takes to encode a biosymbol in an `Alphabet`
Alphabets `A` should implement `BitsPerSymbol(::A)`.
For compatibility with existing BioSequences, the number of bits should be a power of two
between 1 and 32, both inclusive.
See also: [`Alphabet`](@ref)
"""
struct BitsPerSymbol{N} end

# Return the value `N` carried in the trait's type parameter.
function bits_per_symbol(::BitsPerSymbol{N}) where {N}
    return N
end

Expand All @@ -87,7 +97,21 @@ This decoding is checked to enforce valid data element.
If `s` cannot be encoded to the given alphabet, throw an `EncodeError`
"""
encode(A::Alphabet, s::BioSymbol) = throw(EncodeError(A, s))
# Throwing front end for `tryencode`: forward to `tryencode(A, s)` and convert a
# `nothing` result into an `EncodeError`.
@inline function encode(A::Alphabet, s::BioSymbol)
    encoded = @inline tryencode(A, s)
    if encoded === nothing
        throw(EncodeError(A, s))
    end
    return encoded
end

# Generic fallback, selected when no more specific `tryencode` method matches the
# alphabet/symbol pair. Note that it throws an `EncodeError` rather than
# returning `nothing`.
tryencode(A::Alphabet, s::BioSymbol) = throw(EncodeError(A, s))

"""
    tryencode(::Alphabet, x::S)

Try encoding the `BioSymbol` `x` (of type `S`) to the internal representation of
the given [`Alphabet`](@ref), returning `nothing` if not successful.

See also: [`encode`](@ref), [`decode`](@ref)
"""
function tryencode end

"""
EncodeError
Expand Down Expand Up @@ -176,11 +200,9 @@ for A in (DNAAlphabet, RNAAlphabet)
@eval begin

# 2-bit encoding
# NOTE(review): two function headers appear back to back below and only one `end`
# closes them; this looks like pre- and post-change lines of a diff shown
# together — confirm against the real file before editing the code.
@inline function encode(::$(A){2}, nt::$(T))
function tryencode(::$(A){2}, nt::$(T))
# View the nucleotide as its raw UInt8 bit pattern.
u = reinterpret(UInt8, nt)
if count_ones(u) != 1
throw(EncodeError($(A){2}(), nt))
end
# Only bit patterns with exactly one set bit are accepted; anything else yields `nothing`.
isone(count_ones(u)) || return nothing
# The result is the position of the single set bit, converted to UInt.
trailing_zeros(u) % UInt
end

Expand All @@ -191,7 +213,7 @@ for A in (DNAAlphabet, RNAAlphabet)
# Accept any `Unsigned` value by converting it to `UInt` and calling `decode` again.
# NOTE(review): presumably a more specific `UInt` method exists outside this view — confirm.
@inline decode(::$(A){2}, x::Unsigned) = decode($(A){2}(), UInt(x))

# 4-bit encoding
# NOTE(review): two function headers appear back to back below with a single `end`;
# this looks like pre- and post-change lines of a diff shown together — confirm.
@inline function encode(::$(A){4}, nt::$(T))
function tryencode(::$(A){4}, nt::$(T))
# The raw UInt8 bit pattern of the nucleotide, converted to UInt, is returned
# as-is; this body has no branch that returns `nothing`.
return convert(UInt, reinterpret(UInt8, nt))
end

Expand Down Expand Up @@ -227,10 +249,8 @@ function symbols(::AminoAcidAlphabet)
AA_Y, AA_V, AA_O, AA_U, AA_B, AA_J, AA_Z, AA_X, AA_Term, AA_Gap)
end

# NOTE(review): an `encode` header with a throwing range check is immediately
# followed by a `tryencode` header; this looks like pre- and post-change lines of
# a diff shown together — confirm against the real file.
@inline function encode(::AminoAcidAlphabet, aa::AminoAcid)
if reinterpret(UInt8, aa) > reinterpret(UInt8, AA_Gap)
throw(EncodeError(AminoAcidAlphabet(), aa))
end
function tryencode(::AminoAcidAlphabet, aa::AminoAcid)
# Raw values greater than that of `AA_Gap` are rejected with `nothing`.
reinterpret(UInt8, aa) > reinterpret(UInt8, AA_Gap) && return nothing
# Otherwise the raw UInt8 value, converted to UInt, is the encoding.
return convert(UInt, reinterpret(UInt8, aa))
end

Expand Down
23 changes: 23 additions & 0 deletions test/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ end
end

encode = BioSequences.encode
tryencode = BioSequences.tryencode
EncodeError = BioSequences.EncodeError
decode = BioSequences.decode

Expand Down Expand Up @@ -121,6 +122,15 @@ end
@test_throws EncodeError encode(DNAAlphabet{2}(), DNA_N)
@test_throws EncodeError encode(DNAAlphabet{2}(), DNA_Gap)

@test tryencode(DNAAlphabet{2}(), DNA_A) == UInt(0x00)
@test tryencode(DNAAlphabet{2}(), DNA_C) == UInt(0x01)
@test tryencode(DNAAlphabet{2}(), DNA_G) == UInt(0x02)
@test tryencode(DNAAlphabet{2}(), DNA_T) == UInt(0x03)
@test tryencode(DNAAlphabet{2}(), DNA_M) === nothing
@test tryencode(DNAAlphabet{2}(), DNA_N) === nothing
@test tryencode(DNAAlphabet{2}(), DNA_Gap) === nothing
@test_throws EncodeError tryencode(DNAAlphabet{2}(), RNA_G)

# 4 bits
for nt in BioSymbols.alphabet(DNA)
@test encode(DNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
Expand All @@ -137,18 +147,31 @@ end
@test_throws EncodeError encode(RNAAlphabet{2}(), RNA_N)
@test_throws EncodeError encode(RNAAlphabet{2}(), RNA_Gap)

@test tryencode(RNAAlphabet{2}(), RNA_A) == UInt(0x00)
@test tryencode(RNAAlphabet{2}(), RNA_C) == UInt(0x01)
@test tryencode(RNAAlphabet{2}(), RNA_G) == UInt(0x02)
@test tryencode(RNAAlphabet{2}(), RNA_U) == UInt(0x03)
@test tryencode(RNAAlphabet{2}(), RNA_M) === nothing
@test tryencode(RNAAlphabet{2}(), RNA_N) === nothing
@test tryencode(RNAAlphabet{2}(), RNA_Gap) === nothing
@test_throws EncodeError tryencode(RNAAlphabet{2}(), DNA_G)

# 4 bits
for nt in BioSymbols.alphabet(RNA)
@test encode(RNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
@test tryencode(RNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
end
end

@testset "AminoAcid" begin
    alphabet = AminoAcidAlphabet()
    @test encode(alphabet, AA_A) === UInt(0x00)

    # Every listed amino acid must encode to its raw byte value (as a UInt)
    # through both the throwing and the non-throwing entry point.
    for residue in BioSymbols.alphabet(AminoAcid)
        expected = convert(UInt, reinterpret(UInt8, residue))
        @test encode(alphabet, residue) === expected
        @test tryencode(alphabet, residue) === expected
    end

    # Invalid symbols: `encode` throws, `tryencode` returns `nothing`.
    @test_throws BioSequences.EncodeError encode(alphabet, BioSymbols.AA_INVALID)
    @test tryencode(alphabet, reinterpret(AminoAcid, typemax(UInt8))) === nothing
    @test tryencode(alphabet, BioSymbols.AA_INVALID) === nothing
end
end

Expand Down

0 comments on commit bf13afb

Please sign in to comment.