Skip to content

Commit

Permalink
Do not check for valid data in decode
Browse files Browse the repository at this point in the history
Relax the requirement of `Alphabet`, such that it is no longer necessary to
check for a valid encoding in `decode`.
This implies that storing invalid data in a `BioSequence` is disallowed, but it
speeds up `getindex(::BioSequence)`, which is critical.
  • Loading branch information
jakobnissen committed Jul 22, 2023
1 parent d5f81d6 commit c880438
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 35 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [UNRELEASED]
* Relax requirement of `decode`, such that it no longer needs to check for
invalid data. Note that this change is not breaking, since it is not possible
for correctly-implemented `Alphabet` and `BioSequence` to store invalid data.

## [3.1.6]
* The heuristics for translating sequences with ambiguous symbols are now improved.
Expand Down
22 changes: 1 addition & 21 deletions src/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ and T for a DNA Alphabet that requires only 2 bits to represent each symbol.
* Alphabets span over a *finite* set of biological symbols.
* The alphabet controls the encoding from some internal "encoded data" to a BioSymbol
of the alphabet's element type, as well as the decoding, the inverse process.
* An `Alphabet`'s `encode` and `decode` methods must not produce invalid data.
* An `Alphabet`'s `encode` method must not produce invalid data.
Every subtype `A` of `Alphabet` must implement:
* `Base.eltype(::Type{A})::Type{S}` for some eltype `S`, which must be a `BioSymbol`.
Expand Down Expand Up @@ -101,20 +101,9 @@ end
decode(::Alphabet, x::E)
Decode internal representation `E` to a `BioSymbol` using an `Alphabet`.
This decoding is checked to enforce valid biosymbols.
"""
function decode end

struct DecodeError{A<:Alphabet,T} <: Exception
val::T
end

DecodeError(::A, val::T) where {A,T} = DecodeError{A,T}(val)

function Base.showerror(io::IO, err::DecodeError{A}) where {A}
print(io, "cannot decode ", err.val, " in ", A)
end

function Base.iterate(a::Alphabet, state = 1)
state > length(a) && return nothing
@inbounds sym = symbols(a)[state]
Expand Down Expand Up @@ -186,9 +175,6 @@ for A in (DNAAlphabet, RNAAlphabet)
end

@inline function decode(::$(A){2}, x::UInt)
if x > UInt(3)
throw(DecodeError($(A){2}(), x))
end
return reinterpret($(T), 0x01 << (x & 0x03))
end

Expand All @@ -203,9 +189,6 @@ for A in (DNAAlphabet, RNAAlphabet)
end

@inline function decode(::$(A){4}, x::UInt)
if !isvalid($(T), x)
throw(DecodeError($(A){4}(), x))
end
return reinterpret($(T), x % UInt8)
end

Expand Down Expand Up @@ -245,9 +228,6 @@ end
end

@inline function decode(::AminoAcidAlphabet, x::UInt)
if x > 0x1b
throw(DecodeError(AminoAcidAlphabet(), x))
end
return reinterpret(AminoAcid, x % UInt8)
end

Expand Down
14 changes: 0 additions & 14 deletions test/alphabet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ end
encode = BioSequences.encode
EncodeError = BioSequences.EncodeError
decode = BioSequences.decode
DecodeError = BioSequences.DecodeError

# NOTE: See the docs for the interface of Alphabet
struct ReducedAAAlphabet <: Alphabet end
Expand Down Expand Up @@ -76,7 +75,6 @@ function BioSequences.encode(::ReducedAAAlphabet, aa::AminoAcid)
end

function BioSequences.decode(::ReducedAAAlphabet, x::UInt)
x >= length(DEC_LUT) && throw(DecodeError(ReducedAAAlphabet(), x))
DEC_LUT[x + UInt(1)]
end

Expand All @@ -100,11 +98,6 @@ end
@test_throws EncodeError encode(ReducedAAAlphabet(), AA_R)
@test_throws EncodeError encode(ReducedAAAlphabet(), AA_Gap)
@test_throws EncodeError encode(ReducedAAAlphabet(), reinterpret(AminoAcid, 0xff))

@test_throws DecodeError decode(ReducedAAAlphabet(), UInt(16))
@test_throws DecodeError decode(ReducedAAAlphabet(), UInt(255))
@test_throws DecodeError decode(ReducedAAAlphabet(), UInt(432881))
@test_throws DecodeError decode(ReducedAAAlphabet(), typemax(UInt))
end

@testset "Encoding DNA/RNA/AminoAcid" begin
Expand Down Expand Up @@ -158,14 +151,11 @@ end
@test decode(DNAAlphabet{2}(), 0x01) === DNA_C
@test decode(DNAAlphabet{2}(), 0x02) === DNA_G
@test decode(DNAAlphabet{2}(), 0x03) === DNA_T
@test_throws DecodeError decode(DNAAlphabet{2}(), 0x04)
@test_throws DecodeError decode(DNAAlphabet{2}(), 0x0e)

# 4 bits
for x in 0b0000:0b1111
@test decode(DNAAlphabet{4}(), x) === reinterpret(DNA, x)
end
@test_throws DecodeError decode(DNAAlphabet{4}(), 0b10000)
end

@testset "RNA" begin
Expand All @@ -174,22 +164,18 @@ end
@test decode(RNAAlphabet{2}(), 0x01) === RNA_C
@test decode(RNAAlphabet{2}(), 0x02) === RNA_G
@test decode(RNAAlphabet{2}(), 0x03) === RNA_U
@test_throws DecodeError decode(RNAAlphabet{2}(), 0x04)
@test_throws DecodeError decode(RNAAlphabet{2}(), 0x0e)

# 4 bits
for x in 0b0000:0b1111
@test decode(RNAAlphabet{4}(), x) === reinterpret(RNA, x)
end
@test_throws DecodeError decode(RNAAlphabet{4}(), 0b10000)
end

@testset "AminoAcid" begin
@test decode(AminoAcidAlphabet(), 0x00) === AA_A
for x in 0x00:0x1b
@test decode(AminoAcidAlphabet(), x) === reinterpret(AminoAcid, x)
end
@test_throws BioSequences.DecodeError decode(AminoAcidAlphabet(), 0x1c)
end
end

Expand Down

0 comments on commit c880438

Please sign in to comment.