Skip to content

Commit

Permalink
Add tryencode and trydecode
Browse files Browse the repository at this point in the history
The encode and decode methods are not allowed to produce invalid data. Instead,
they throw an error when encountering invalid input data.
This can lead to some frustration when checking if a symbol is permitted in an
alphabet.
One way to solve it is by checking `symbol in symbols(A)`, but this is not
particularly efficient.

This PR adds `tryencode` and `trydecode` methods to the existing alphabets. These
methods return `nothing` when given invalid data.
The `encode` and `decode` methods now call their try-variants internally.

May solve BioJulia#219
  • Loading branch information
jakobnissen committed Jun 16, 2022
1 parent bf4aac5 commit 28a96bb
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 34 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

### Added
* It is now possible to `join` BioSymbols into a BioSequence.
* Added non-exported functions `trydecode` and `tryencode`.

## [3.0.1]
### Removed
Expand Down
65 changes: 31 additions & 34 deletions src/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
### This file is a part of BioJulia.
### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md

# TODO: Make Alphabet require `tryencode/trydecode` and rely on the generic fallback for encode/decode.
"""
Alphabet
Expand Down Expand Up @@ -85,7 +86,18 @@ iscomplete(A::Alphabet) = Val(length(symbols(A)) === 1 << bits_per_symbol(A))
Encode BioSymbol `S` to an internal representation using an `Alphabet`.
This encoding is checked to enforce valid data elements.
"""
function encode end
@inline function encode(A::Alphabet, s::BioSymbol)
    # Delegate to the non-throwing variant; a `nothing` result means `s` has
    # no encoding in alphabet `A`, which this checked method reports as an error.
    encoded = tryencode(A, s)
    if encoded === nothing
        throw(EncodeError(A, s))
    end
    return encoded
end

"""
    tryencode(::Alphabet, x::S)

Try encoding the BioSymbol `x` (of type `S`) to the internal representation of
the given `Alphabet`, returning `nothing` if not successful.

The generic `encode` method calls this function and throws an `EncodeError`
when the result is `nothing`.
"""
function tryencode end

struct EncodeError{A<:Alphabet,T} <: Exception
val::T
Expand All @@ -103,7 +115,10 @@ end
Decode internal representation `E` to a `BioSymbol` using an `Alphabet`.
This decoding is checked to enforce valid biosymbols.
"""
function decode end
@inline function decode(A::Alphabet, x)
    # Delegate to the non-throwing variant; a `nothing` result means `x` is not
    # a valid encoded value for alphabet `A`.
    symbol = trydecode(A, x)
    symbol === nothing && throw(DecodeError(A, x))
    return symbol
end

struct DecodeError{A<:Alphabet,T} <: Exception
val::T
Expand Down Expand Up @@ -172,38 +187,27 @@ for A in (DNAAlphabet, RNAAlphabet)
@eval begin

# 2-bit encoding
@inline function encode(::$(A){2}, nt::$(T))
@inline function tryencode(::$(A){2}, nt::$(T))
if count_ones(nt) != 1 || !isvalid(nt)
throw(EncodeError($(A){2}(), nt))
return nothing
end
return convert(UInt, @inbounds twobitnucs[reinterpret(UInt8, nt) + 0x01])
end

@inline function decode(::$(A){2}, x::UInt)
if x > UInt(3)
throw(DecodeError($(A){2}(), x))
end
return reinterpret($(T), 0x01 << (x & 0x03))
# Decode a 2-bit code (0 through 3) to its nucleotide, or return `nothing` when
# `x` is not a valid 2-bit code.
@inline function trydecode(::$(A){2}, x::Unsigned)
    # Compare in the input's own integer type. Converting with `UInt(x)` first
    # would throw `InexactError` for wide inputs (e.g. a large `UInt128`)
    # instead of reporting them as invalid.
    x > 0x03 && return nothing
    # `x` is in 0:3 here, so the truncation to `UInt8` is lossless and the
    # shift sets exactly one of the four low bits.
    return reinterpret($(T), 0x01 << (x % UInt8))
end

@inline decode(::$(A){2}, x::Unsigned) = decode($(A){2}(), UInt(x))

# 4-bit encoding
@inline function encode(::$(A){4}, nt::$(T))
if !isvalid(nt)
throw(EncodeError($(A){4}(), nt))
end
return convert(UInt, reinterpret(UInt8, nt))
@inline function tryencode(::$(A){4}, nt::$(T))
return isvalid(nt) ? convert(UInt, reinterpret(UInt8, nt)) : nothing
end

@inline function decode(::$(A){4}, x::UInt)
if !isvalid($(T), x)
throw(DecodeError($(A){4}(), x))
end
return reinterpret($(T), x % UInt8)
# Decode a 4-bit code to its nucleotide, or return `nothing` when `x` is not a
# valid encoded value.
@inline function trydecode(::$(A){4}, x::Unsigned)
    # A value that does not fit in `UInt` cannot be a valid code. Rejecting it
    # up front keeps this function from throwing `InexactError` on wide inputs
    # (e.g. a large `UInt128`), which `UInt(x)` would do.
    x > typemax(UInt) && return nothing
    xu = x % UInt  # lossless: `x` fits in `UInt` after the guard above
    return isvalid($T, xu) ? reinterpret($T, xu % UInt8) : nothing
end

@inline decode(::$(A){4}, x::Unsigned) = decode($(A){4}(), UInt(x))
end
end

Expand Down Expand Up @@ -231,22 +235,15 @@ function symbols(::AminoAcidAlphabet)
AA_Y, AA_V, AA_O, AA_U, AA_B, AA_J, AA_Z, AA_X, AA_Term, AA_Gap)
end

@inline function encode(::AminoAcidAlphabet, aa::AminoAcid)
@inline function tryencode(::AminoAcidAlphabet, aa::AminoAcid)
if reinterpret(UInt8, aa) > reinterpret(UInt8, AA_Gap)
throw(EncodeError(AminoAcidAlphabet(), aa))
return nothing
end
return convert(UInt, reinterpret(UInt8, aa))
end

@inline function decode(::AminoAcidAlphabet, x::UInt)
if x > 0x1b
throw(DecodeError(AminoAcidAlphabet(), x))
end
return reinterpret(AminoAcid, x % UInt8)
end

@inline function decode(::AminoAcidAlphabet, x::Unsigned)
return decode(AminoAcidAlphabet(), UInt(x))
@inline function trydecode(::AminoAcidAlphabet, x::Unsigned)
    # Codes above 0x1b are rejected.
    # NOTE(review): 0x1b presumably equals `reinterpret(UInt8, AA_Gap)`, the
    # upper bound used by `tryencode` — confirm.
    if x > 0x1b
        return nothing
    end
    return reinterpret(AminoAcid, x % UInt8)
end

# AsciiAlphabet trait - add to user defined type to use speedups.
Expand Down
26 changes: 26 additions & 0 deletions test/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ end
end

encode = BioSequences.encode
tryencode = BioSequences.tryencode
EncodeError = BioSequences.EncodeError
decode = BioSequences.decode
DecodeError = BioSequences.DecodeError
Expand Down Expand Up @@ -118,11 +119,22 @@ end
@test_throws EncodeError encode(DNAAlphabet{2}(), DNA_N)
@test_throws EncodeError encode(DNAAlphabet{2}(), DNA_Gap)

@test tryencode(DNAAlphabet{2}(), DNA_A) == UInt(0x00)
@test tryencode(DNAAlphabet{2}(), DNA_C) == UInt(0x01)
@test tryencode(DNAAlphabet{2}(), DNA_G) == UInt(0x02)
@test tryencode(DNAAlphabet{2}(), DNA_T) == UInt(0x03)
@test tryencode(DNAAlphabet{2}(), DNA_M) === nothing
@test tryencode(DNAAlphabet{2}(), DNA_N) === nothing
@test tryencode(DNAAlphabet{2}(), DNA_Gap) === nothing
@test_throws MethodError tryencode(DNAAlphabet{2}(), RNA_G)

# 4 bits
for nt in BioSymbols.alphabet(DNA)
@test encode(DNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
@test tryencode(DNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
end
@test_throws EncodeError encode(DNAAlphabet{4}(), reinterpret(DNA, 0b10000))
@test tryencode(DNAAlphabet{4}(), reinterpret(DNA, 0b10000)) === nothing
end

@testset "RNA" begin
Expand All @@ -135,19 +147,33 @@ end
@test_throws EncodeError encode(RNAAlphabet{2}(), RNA_N)
@test_throws EncodeError encode(RNAAlphabet{2}(), RNA_Gap)

@test tryencode(RNAAlphabet{2}(), RNA_A) == UInt(0x00)
@test tryencode(RNAAlphabet{2}(), RNA_C) == UInt(0x01)
@test tryencode(RNAAlphabet{2}(), RNA_G) == UInt(0x02)
@test tryencode(RNAAlphabet{2}(), RNA_U) == UInt(0x03)
@test tryencode(RNAAlphabet{2}(), RNA_M) === nothing
@test tryencode(RNAAlphabet{2}(), RNA_N) === nothing
@test tryencode(RNAAlphabet{2}(), RNA_Gap) === nothing
@test_throws MethodError tryencode(RNAAlphabet{2}(), DNA_G)

# 4 bits
for nt in BioSymbols.alphabet(RNA)
@test encode(RNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
@test tryencode(RNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt))
end
@test_throws EncodeError encode(RNAAlphabet{4}(), reinterpret(RNA, 0b10000))
@test tryencode(RNAAlphabet{4}(), reinterpret(RNA, 0b10000)) === nothing
end

# Encoding tests for AminoAcidAlphabet: valid symbols encode to their raw byte
# value, invalid symbols are rejected.
@testset "AminoAcid" begin
@test encode(AminoAcidAlphabet(), AA_A) === UInt(0x00)
# `encode` and `tryencode` must agree on every symbol of the amino acid alphabet.
for aa in BioSymbols.alphabet(AminoAcid)
@test encode(AminoAcidAlphabet(), aa) === convert(UInt, reinterpret(UInt8, aa))
@test tryencode(AminoAcidAlphabet(), aa) === convert(UInt, reinterpret(UInt8, aa))
end
# Invalid symbols: `encode` throws, while `tryencode` returns `nothing`.
@test_throws BioSequences.EncodeError encode(AminoAcidAlphabet(), BioSymbols.AA_INVALID)
@test tryencode(AminoAcidAlphabet(), reinterpret(AminoAcid, typemax(UInt8))) === nothing
@test tryencode(AminoAcidAlphabet(), BioSymbols.AA_INVALID) === nothing
end
end

Expand Down

0 comments on commit 28a96bb

Please sign in to comment.