diff --git a/CHANGELOG.md b/CHANGELOG.md index 1af37013..0be93ea0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,14 +4,21 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). -## [UNRELEASED] - -* Add new `ncbi_tranlation_table` of the 15th genetic code (i.e `blepharisma_macronuclear_genetic_code`) - +## [3.3.0] +* Add functions `bioseq` and `guess_alphabet` to easily construct a biosequence + of an unknown alphabet from e.g. a string. * Relax requirement of `decode`, such that it no longer needs to check for invalid data. Note that this change is not breaking, since it is not possible for correctly-implemented `Alphabet` and `BioSequence` to store invalid data. +## [3.2.0] +* Dropped support for Julia versions older than 1.10.0 +* Added a 'Recipes' page to the documentation +* Add new genetic code: `blepharisma_macronuclear_genetic_code` +* Improve documentation of sequence count methods and sequence string literals +* Various performance improvements to counting, `ExactSearchQuery` and + `ispalindromic` + ## [3.1.6] * The heuristics for translating sequences with ambiguous symbols is now improved. Now, `translate` does not rely on heuristics but uses an algorithm that always diff --git a/Project.toml b/Project.toml index 42d23282..22a1e74f 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "BioSequences" uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" authors = ["Sabrina Jaye Ward ", "Jakob Nissen "] -version = "3.2.0" +version = "3.3.0" [deps] BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" diff --git a/docs/src/construction.md b/docs/src/construction.md index 000c35d9..63f2f4ce 100644 --- a/docs/src/construction.md +++ b/docs/src/construction.md @@ -302,6 +302,33 @@ the bodies of things like for loops. And if you use them and are unsure, use the @aa_str ``` +## Loose parsing +As of version 3.2.0, BioSequences.jl provide the [`bioseq`](@ref) function, which can be used to build a `LongSequence` +from a string (or an `AbstractVector{UInt8}`) without knowing the correct `Alphabet`. + +```jldoctest +julia> bioseq("ATGTGCTGA") +9nt DNA Sequence: +ATGTGCTGA +``` + +The function will prioritise 2-bit alphabets over 4-bit alphabets, and prefer smaller alphabets (like `DNAAlphabet{4}`) over larger (like `AminoAcidAlphabet`). +If the input cannot be encoded by any of the built-in alphabets, an error is thrown: + +```jldoctest +julia> bioseq("0!(CC!;#&&%") +ERROR: cannot encode 0x30 in AminoAcidAlphabet +[...] +``` + +Note that this function is only intended to be used for interactive, ephemeral work. +The function is necessarily type unstable, and the precise returned alphabet for a given input is a heuristic which is subject to change. + +```@docs +bioseq +guess_alphabet +``` + ## Comparison to other sequence types Following Base standards, BioSequences do not compare equal to other containers even if they have the same elements. To e.g. compare a BioSequence with a vector of DNA, compare the elements themselves: diff --git a/src/BioSequences.jl b/src/BioSequences.jl index 111d33b1..48285efc 100644 --- a/src/BioSequences.jl +++ b/src/BioSequences.jl @@ -95,6 +95,9 @@ export ### Alphabets ### + guess_alphabet, + bioseq, + # Types & aliases Alphabet, NucleicAcidAlphabet, diff --git a/src/alphabet.jl b/src/alphabet.jl index aaa269d0..790a822f 100644 --- a/src/alphabet.jl +++ b/src/alphabet.jl @@ -111,7 +111,7 @@ end EncodeError(::A, val::T) where {A,T} = EncodeError{A,T}(val) function Base.showerror(io::IO, err::EncodeError{A}) where {A} - print(io, "cannot encode ", err.val, " in ", A) + print(io, "cannot encode ", repr(err.val), " in ", A) end """ @@ -305,3 +305,75 @@ for (anum, atype) in enumerate((DNAAlphabet{4}, DNAAlphabet{2}, RNAAlphabet{4}, ascii_encode(::$(atype), x::UInt8) = @inbounds $(tablename)[x + 1] end end + +const GUESS_ALPHABET_LUT = let + v = zeros(UInt8, 64) + for (offset, A) in [ + (0, DNAAlphabet{2}()), + (0, RNAAlphabet{2}()), + (1, DNAAlphabet{4}()), + (1, RNAAlphabet{4}()), + (2, DNAAlphabet{4}()), + (3, AminoAcidAlphabet()) + ] + for symbol in A + for byte in (UInt8(uppercase(Char(symbol))), UInt8(lowercase(Char(symbol)))) + v[div(byte, 2) + 1] |= 0x01 << (4*isodd(byte) + offset) + end + end + end + Tuple(v) +end + +""" + guess_alphabet(s::Union{AbstractString, AbstractVector{UInt8}}) -> Union{Integer, Alphabet} + +Pick an `Alphabet` that can encode input `s`. If no `Alphabet` can, return the index of the first +byte of the input which is not encodable in any alphabet. +This function only knows about the alphabets listed below. If multiple alphabets are possible, +pick the first from the order below (i.e. `DNAAlphabet{2}()` if possible, otherwise `RNAAlphabet{2}()` etc). +1. `DNAAlphabet{2}()` +2. `RNAAlphabet{2}()` +3. `DNAAlphabet{4}()` +4. `RNAAlphabet{4}()` +5. `AminoAcidAlphabet()` + +!!! warning + The functions `bioseq` and `guess_alphabet` are intended for use in interactive + sessions, and are not suitable for use in packages or non-ephemeral work. + They are type unstable, and their heuristics **are subject to change** in minor versions. + +# Examples +```jldoctest +julia> guess_alphabet("AGGCA") +DNAAlphabet{2}() + +julia> guess_alphabet("WKLQSTV") +AminoAcidAlphabet() + +julia> guess_alphabet("QAWT+!") +5 + +julia> guess_alphabet("UAGCSKMU") +RNAAlphabet{4}() +``` +""" +function guess_alphabet(v::AbstractVector{UInt8}) + possibilities = 0x0f + for (index, byte) in pairs(v) + lut_byte = @inbounds GUESS_ALPHABET_LUT[div(byte & 0x7f, 2) + 0x01] + lut_value = (lut_byte >>> (4 * isodd(byte))) & 0x0f + possibilities &= (lut_value * (byte < 0x80)) + iszero(possibilities) && return index + end + dna = !iszero(possibilities & 0b0100) + @assert !iszero(possibilities) # We checked that in the loop above + if !iszero(possibilities & 0b0001) + dna ? DNAAlphabet{2}() : RNAAlphabet{2}() + elseif !iszero(possibilities & 0b0010) + dna ? DNAAlphabet{4}() : RNAAlphabet{4}() + else + AminoAcidAlphabet() + end +end +guess_alphabet(s::AbstractString) = guess_alphabet(codeunits(s)) diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl index abdd2f94..03c1b15c 100644 --- a/src/longsequences/constructors.jl +++ b/src/longsequences/constructors.jl @@ -85,4 +85,38 @@ function LongSequence{A}( return copyto!(seq, 1, src, first(part), len) end -Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq) \ No newline at end of file +Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq) + +""" + bioseq(s::Union{AbstractString, AbstractVector{UInt8}}) -> LongSequence + +Parse `s` into a `LongSequence` with an appropriate `Alphabet`, or throw an exception +if no alphabet matches. +See [`guess_alphabet`](@ref) for the available alphabets and the alphabet priority. + +!!! warning + The functions `bioseq` and `guess_alphabet` are intended for use in interactive + sessions, and are not suitable for use in packages or non-ephemeral work. + They are type unstable, and their heuristics **are subject to change** in minor versions. + +# Examples +```jldoctest +julia> bioseq("QMKLPEEFW") +9aa Amino Acid Sequence: +QMKLPEEFW + +julia> bioseq("UAUGCUGUAGG") +11nt RNA Sequence: +UAUGCUGUAGG + +julia> bioseq("PKMW#3>>0;kL") +ERROR: cannot encode 0x23 in AminoAcidAlphabet +[...] +``` +""" +function bioseq(s::AbstractVector{UInt8}) + A = guess_alphabet(s) + A isa Integer && throw(EncodeError(AminoAcidAlphabet(), s[A])) + LongSequence{typeof(A)}(s) +end +bioseq(s::AbstractString) = bioseq(codeunits(s)) diff --git a/test/alphabet.jl b/test/alphabet.jl index f74edaa4..8fa24a21 100644 --- a/test/alphabet.jl +++ b/test/alphabet.jl @@ -196,3 +196,30 @@ end @test BioSequences.has_interface(Alphabet, RNAAlphabet{4}()) @test BioSequences.has_interface(Alphabet, AminoAcidAlphabet()) end + +@testset "Guess parsing and guess alphabet" begin + for (A, Ss) in [ + (DNAAlphabet{2}(), ["", "TAGCT", "AAA"]), + (RNAAlphabet{2}(), ["U", "CGGUAG", "CCCCU"]), + (DNAAlphabet{4}(), ["W", "HKW--", "TAGCTATSG", "TAGC-TAG"]), + (RNAAlphabet{4}(), ["WU", "HAUH-CD", "VKMNSU"]), + (AminoAcidAlphabet(), ["Q", "PLBMW", "***"]) + ] + for S in Ss + for T in [String, SubString, Vector{UInt8}, Test.GenericString] + @test guess_alphabet(T(S)) == A + @test bioseq(T(S)) isa LongSequence{typeof(A)} + end + end + end + for (index, S) in [ + (4, "QMN!KK"), + (7, "ATCGAT???"), + (1, ";") + ] + for T in [String, SubString, Vector{UInt8}, Test.GenericString] + @test guess_alphabet(T(S)) == index + @test_throws BioSequences.EncodeError bioseq(T(S)) + end + end +end