diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f3c35f5..982bb44a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added * It is now possible to `join` BioSymbols into a BioSequence. +* Added non-exported functions `trydecode` and `tryencode`. ## [3.0.1] ### Removed diff --git a/src/alphabet.jl b/src/alphabet.jl index 4a0ed9b5..11c79889 100644 --- a/src/alphabet.jl +++ b/src/alphabet.jl @@ -7,6 +7,7 @@ ### This file is a part of BioJulia. ### License is MIT: https://github.com/BioJulia/BioSequences.jl/blob/master/LICENSE.md +# TODO: Make Alphabet require `tryencode/trydecode` and rely on the generic fallback for encode/decode. """ Alphabet @@ -85,7 +86,18 @@ iscomplete(A::Alphabet) = Val(length(symbols(A)) === 1 << bits_per_symbol(A)) Encode BioSymbol `S` to an internal representation using an `Alphabet`. This decoding is checked to enforce valid data element. """ -function encode end +@inline function encode(A::Alphabet, s::BioSymbol) + y = tryencode(A, s) + return y === nothing ? throw(EncodeError(A, s)) : y +end + +""" + tryencode(::Alphabet, x::S) + +Try encoding BioSymbol `S` to the internal representation of `Alphabet`, +returning `nothing` if not successful. +""" +function tryencode end struct EncodeError{A<:Alphabet,T} <: Exception val::T @@ -103,7 +115,10 @@ end Decode internal representation `E` to a `BioSymbol` using an `Alphabet`. This decoding is checked to enforce valid biosymbols. """ -function decode end +@inline function decode(A::Alphabet, x) + y = trydecode(A, x) + y === nothing ? throw(DecodeError(A, x)) : y +end struct DecodeError{A<:Alphabet,T} <: Exception val::T @@ -172,38 +187,27 @@ for A in (DNAAlphabet, RNAAlphabet) @eval begin # 2-bit encoding - @inline function encode(::$(A){2}, nt::$(T)) + @inline function tryencode(::$(A){2}, nt::$(T)) if count_ones(nt) != 1 || !isvalid(nt) - throw(EncodeError($(A){2}(), nt)) + return nothing end return convert(UInt, @inbounds twobitnucs[reinterpret(UInt8, nt) + 0x01]) end - @inline function decode(::$(A){2}, x::UInt) - if x > UInt(3) - throw(DecodeError($(A){2}(), x)) - end - return reinterpret($(T), 0x01 << (x & 0x03)) + @inline function trydecode(::$(A){2}, x::Unsigned) + xu = UInt(x) + return xu > UInt(3) ? nothing : reinterpret($(T), 0x01 << (xu & 0x03)) end - @inline decode(::$(A){2}, x::Unsigned) = decode($(A){2}(), UInt(x)) - # 4-bit encoding - @inline function encode(::$(A){4}, nt::$(T)) - if !isvalid(nt) - throw(EncodeError($(A){4}(), nt)) - end - return convert(UInt, reinterpret(UInt8, nt)) + @inline function tryencode(::$(A){4}, nt::$(T)) + return isvalid(nt) ? convert(UInt, reinterpret(UInt8, nt)) : nothing end - @inline function decode(::$(A){4}, x::UInt) - if !isvalid($(T), x) - throw(DecodeError($(A){4}(), x)) - end - return reinterpret($(T), x % UInt8) + @inline function trydecode(::$(A){4}, x::Unsigned) + xu = UInt(x) + return isvalid($T, xu) ? reinterpret($T, xu % UInt8) : nothing end - - @inline decode(::$(A){4}, x::Unsigned) = decode($(A){4}(), UInt(x)) end end @@ -231,22 +235,15 @@ function symbols(::AminoAcidAlphabet) AA_Y, AA_V, AA_O, AA_U, AA_B, AA_J, AA_Z, AA_X, AA_Term, AA_Gap) end -@inline function encode(::AminoAcidAlphabet, aa::AminoAcid) +@inline function tryencode(::AminoAcidAlphabet, aa::AminoAcid) if reinterpret(UInt8, aa) > reinterpret(UInt8, AA_Gap) - throw(EncodeError(AminoAcidAlphabet(), aa)) + return nothing end return convert(UInt, reinterpret(UInt8, aa)) end -@inline function decode(::AminoAcidAlphabet, x::UInt) - if x > 0x1b - throw(DecodeError(AminoAcidAlphabet(), x)) - end - return reinterpret(AminoAcid, x % UInt8) -end - -@inline function decode(::AminoAcidAlphabet, x::Unsigned) - return decode(AminoAcidAlphabet(), UInt(x)) +@inline function trydecode(::AminoAcidAlphabet, x::Unsigned) + return x > 0x1b ? nothing : reinterpret(AminoAcid, x % UInt8) end # AsciiAlphabet trait - add to user defined type to use speedups. diff --git a/test/alphabet.jl b/test/alphabet.jl index ade52780..ec631450 100644 --- a/test/alphabet.jl +++ b/test/alphabet.jl @@ -45,6 +45,7 @@ end end encode = BioSequences.encode +tryencode = BioSequences.tryencode EncodeError = BioSequences.EncodeError decode = BioSequences.decode DecodeError = BioSequences.DecodeError @@ -118,11 +119,22 @@ end @test_throws EncodeError encode(DNAAlphabet{2}(), DNA_N) @test_throws EncodeError encode(DNAAlphabet{2}(), DNA_Gap) + @test tryencode(DNAAlphabet{2}(), DNA_A) == UInt(0x00) + @test tryencode(DNAAlphabet{2}(), DNA_C) == UInt(0x01) + @test tryencode(DNAAlphabet{2}(), DNA_G) == UInt(0x02) + @test tryencode(DNAAlphabet{2}(), DNA_T) == UInt(0x03) + @test tryencode(DNAAlphabet{2}(), DNA_M) === nothing + @test tryencode(DNAAlphabet{2}(), DNA_N) === nothing + @test tryencode(DNAAlphabet{2}(), DNA_Gap) === nothing + @test_throws MethodError tryencode(DNAAlphabet{2}(), RNA_G) + # 4 bits for nt in BioSymbols.alphabet(DNA) @test encode(DNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt)) + @test tryencode(DNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt)) end @test_throws EncodeError encode(DNAAlphabet{4}(), reinterpret(DNA, 0b10000)) + @test tryencode(DNAAlphabet{4}(), reinterpret(DNA, 0b10000)) === nothing end @testset "RNA" begin @@ -135,19 +147,33 @@ end @test_throws EncodeError encode(RNAAlphabet{2}(), RNA_N) @test_throws EncodeError encode(RNAAlphabet{2}(), RNA_Gap) + @test tryencode(RNAAlphabet{2}(), RNA_A) == UInt(0x00) + @test tryencode(RNAAlphabet{2}(), RNA_C) == UInt(0x01) + @test tryencode(RNAAlphabet{2}(), RNA_G) == UInt(0x02) + @test tryencode(RNAAlphabet{2}(), RNA_U) == UInt(0x03) + @test tryencode(RNAAlphabet{2}(), RNA_M) === nothing + @test tryencode(RNAAlphabet{2}(), RNA_N) === nothing + @test tryencode(RNAAlphabet{2}(), RNA_Gap) === nothing + @test_throws MethodError tryencode(RNAAlphabet{2}(), DNA_G) + # 4 bits for nt in BioSymbols.alphabet(RNA) @test encode(RNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt)) + @test tryencode(RNAAlphabet{4}(), nt) === UInt(reinterpret(UInt8, nt)) end @test_throws EncodeError encode(RNAAlphabet{4}(), reinterpret(RNA, 0b10000)) + @test tryencode(RNAAlphabet{4}(), reinterpret(RNA, 0b10000)) === nothing end @testset "AminoAcid" begin @test encode(AminoAcidAlphabet(), AA_A) === UInt(0x00) for aa in BioSymbols.alphabet(AminoAcid) @test encode(AminoAcidAlphabet(), aa) === convert(UInt, reinterpret(UInt8, aa)) + @test tryencode(AminoAcidAlphabet(), aa) === convert(UInt, reinterpret(UInt8, aa)) end @test_throws BioSequences.EncodeError encode(AminoAcidAlphabet(), BioSymbols.AA_INVALID) + @test tryencode(AminoAcidAlphabet(), reinterpret(AminoAcid, typemax(UInt8))) === nothing + @test tryencode(AminoAcidAlphabet(), BioSymbols.AA_INVALID) === nothing end end