Add functions guess_parse and guess_alphabet (#292)

Add functions `guess_parse` and `guess_alphabet` This is useful for ephemeral, interactive programming, e.g. in the REPL when you're given some data and just need to work with it right now. This does not really solve the similar problem of fallible constructors / parsing into BioSequences.
BioJulia · Oct 22, 2024 · b47c3d6 · b47c3d6 · jakobnissen · Oct 22, 2024
1 parent 761f053
commit b47c3d6
Show file tree

Hide file tree

Showing 7 changed files with 177 additions and 7 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,14 +4,21 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
 
-## [UNRELEASED]
-
-* Add new `ncbi_tranlation_table` of the 15th genetic code (i.e `blepharisma_macronuclear_genetic_code`)
-
+## [3.3.0]
+* Add functions `bioseq` and `guess_alphabet` to easily construct a biosequence
+  of an unknown alphabet from e.g. a string.
 * Relax requirement of `decode`, such that it no longer needs to check for
   invalid data. Note that this change is not breaking, since it is not possible
   for correctly-implemented `Alphabet` and `BioSequence` to store invalid data.
 
+## [3.2.0]
+* Dropped support for Julia versions older than 1.10.0
+* Added a 'Recipes' page to the documentation
+* Add new genetic code: `blepharisma_macronuclear_genetic_code`
+* Improve documentation of sequence count methods and sequence string literals
+* Various performance improvements to counting, `ExactSearchQuery` and
+  `ispalindromic`
+
 ## [3.1.6]
 * The heuristics for translating sequences with ambiguous symbols is now improved.
   Now, `translate` does not rely on heuristics but uses an algorithm that always

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "BioSequences"
 uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 authors = ["Sabrina Jaye Ward <[email protected]>", "Jakob Nissen <[email protected]>"]
-version = "3.2.0"
+version = "3.3.0"
 
 [deps]
 BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"

diff --git a/docs/src/construction.md b/docs/src/construction.md
@@ -302,6 +302,33 @@ the bodies of things like for loops. And if you use them and are unsure, use the
 @aa_str
 ```
 
+## Loose parsing
+As of version 3.2.0, BioSequences.jl provide the [`bioseq`](@ref) function, which can be used to build a `LongSequence`
+from a string (or an `AbstractVector{UInt8}`) without knowing the correct `Alphabet`.
+
+```jldoctest
+julia> bioseq("ATGTGCTGA")
+9nt DNA Sequence:
+ATGTGCTGA
+```
+
+The function will prioritise 2-bit alphabets over 4-bit alphabets, and prefer smaller alphabets (like `DNAAlphabet{4}`) over larger (like `AminoAcidAlphabet`).
+If the input cannot be encoded by any of the built-in alphabets, an error is thrown:
+
+```jldoctest
+julia> bioseq("0!(CC!;#&&%")
+ERROR: cannot encode 0x30 in AminoAcidAlphabet
+[...]
+```
+
+Note that this function is only intended to be used for interactive, ephemeral work.
+The function is necessarily type unstable, and the precise returned alphabet for a given input is a heuristic which is subject to change.
+
+```@docs
+bioseq
+guess_alphabet
+```
+
 ## Comparison to other sequence types
 Following Base standards, BioSequences do not compare equal to other containers even if they have the same elements.
 To e.g. compare a BioSequence with a vector of DNA, compare the elements themselves:

diff --git a/src/BioSequences.jl b/src/BioSequences.jl
@@ -95,6 +95,9 @@ export
     ### Alphabets
     ###
 
+    guess_alphabet,
+    bioseq,
+
     # Types & aliases
     Alphabet,
     NucleicAcidAlphabet,

diff --git a/src/alphabet.jl b/src/alphabet.jl
@@ -111,7 +111,7 @@ end
 EncodeError(::A, val::T) where {A,T} = EncodeError{A,T}(val)
 
 function Base.showerror(io::IO, err::EncodeError{A}) where {A}
-    print(io, "cannot encode ", err.val, " in ", A)
+    print(io, "cannot encode ", repr(err.val), " in ", A)
 end
 
 """
@@ -305,3 +305,75 @@ for (anum, atype) in enumerate((DNAAlphabet{4}, DNAAlphabet{2}, RNAAlphabet{4},
         ascii_encode(::$(atype), x::UInt8) = @inbounds $(tablename)[x + 1]
     end
 end
+
+const GUESS_ALPHABET_LUT = let
+    v = zeros(UInt8, 64)
+    for (offset, A) in [
+        (0, DNAAlphabet{2}()),
+        (0, RNAAlphabet{2}()),
+        (1, DNAAlphabet{4}()),
+        (1, RNAAlphabet{4}()),
+        (2, DNAAlphabet{4}()),
+        (3, AminoAcidAlphabet())
+    ]
+        for symbol in A
+            for byte in (UInt8(uppercase(Char(symbol))), UInt8(lowercase(Char(symbol))))
+                v[div(byte, 2) + 1] |= 0x01 << (4*isodd(byte) + offset)
+            end
+        end
+    end
+    Tuple(v)
+end
+
+"""
+    guess_alphabet(s::Union{AbstractString, AbstractVector{UInt8}}) -> Union{Integer, Alphabet}
+
+Pick an `Alphabet` that can encode input `s`.  If no `Alphabet` can, return the index of the first
+byte of the input which is not encodable in any alphabet.
+This function only knows about the alphabets listed below. If multiple alphabets are possible,
+pick the first from the order below (i.e. `DNAAlphabet{2}()` if possible, otherwise `RNAAlphabet{2}()` etc).
+1. `DNAAlphabet{2}()`
+2. `RNAAlphabet{2}()`
+3. `DNAAlphabet{4}()`
+4. `RNAAlphabet{4}()`
+5. `AminoAcidAlphabet()`
+
+!!! warning
+    The functions `bioseq` and `guess_alphabet` are intended for use in interactive
+    sessions, and are not suitable for use in packages or non-ephemeral work.
+    They are type unstable, and their heuristics **are subject to change** in minor versions.
+
+# Examples
+```jldoctest
+julia> guess_alphabet("AGGCA")
+DNAAlphabet{2}()
+
+julia> guess_alphabet("WKLQSTV")
+AminoAcidAlphabet()
+
+julia> guess_alphabet("QAWT+!")
+5
+
+julia> guess_alphabet("UAGCSKMU")
+RNAAlphabet{4}()
+```
+"""
+function guess_alphabet(v::AbstractVector{UInt8})
+    possibilities = 0x0f
+    for (index, byte) in pairs(v)
+        lut_byte = @inbounds GUESS_ALPHABET_LUT[div(byte & 0x7f, 2) + 0x01]
+        lut_value = (lut_byte >>> (4 * isodd(byte))) & 0x0f
+        possibilities &= (lut_value * (byte < 0x80))
+        iszero(possibilities) && return index
+    end
+    dna = !iszero(possibilities & 0b0100)
+    @assert !iszero(possibilities) # We checked that in the loop above
+    if !iszero(possibilities & 0b0001)
+        dna ? DNAAlphabet{2}() : RNAAlphabet{2}()
+    elseif !iszero(possibilities & 0b0010)
+        dna ? DNAAlphabet{4}() : RNAAlphabet{4}()
+    else
+        AminoAcidAlphabet()
+    end
+end
+guess_alphabet(s::AbstractString) = guess_alphabet(codeunits(s))
diff --git a/src/longsequences/constructors.jl b/src/longsequences/constructors.jl
@@ -85,4 +85,38 @@ function LongSequence{A}(
     return copyto!(seq, 1, src, first(part), len)
 end
 
-Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq)
+Base.parse(::Type{LongSequence{A}}, seq::AbstractString) where A = LongSequence{A}(seq)
+
+"""
+    bioseq(s::Union{AbstractString, AbstractVector{UInt8}}) -> LongSequence
+
+Parse `s` into a `LongSequence` with an appropriate `Alphabet`, or throw an exception
+if no alphabet matches.
+See [`guess_alphabet`](@ref) for the available alphabets and the alphabet priority.
+
+!!! warning
+    The functions `bioseq` and `guess_alphabet` are intended for use in interactive
+    sessions, and are not suitable for use in packages or non-ephemeral work.
+    They are type unstable, and their heuristics **are subject to change** in minor versions.
+
+# Examples
+```jldoctest
+julia> bioseq("QMKLPEEFW")
+9aa Amino Acid Sequence:
+QMKLPEEFW
+
+julia> bioseq("UAUGCUGUAGG")
+11nt RNA Sequence:
+UAUGCUGUAGG
+
+julia> bioseq("PKMW#3>>0;kL")
+ERROR: cannot encode 0x23 in AminoAcidAlphabet
+[...]
+```
+"""
+function bioseq(s::AbstractVector{UInt8})
+    A = guess_alphabet(s)
+    A isa Integer && throw(EncodeError(AminoAcidAlphabet(), s[A]))
+    LongSequence{typeof(A)}(s)
+end
+bioseq(s::AbstractString) = bioseq(codeunits(s))
diff --git a/test/alphabet.jl b/test/alphabet.jl
@@ -196,3 +196,30 @@ end
     @test BioSequences.has_interface(Alphabet, RNAAlphabet{4}())
     @test BioSequences.has_interface(Alphabet, AminoAcidAlphabet())
 end
+
+@testset "Guess parsing and guess alphabet" begin
+    for (A, Ss) in [
+        (DNAAlphabet{2}(), ["", "TAGCT", "AAA"]),
+        (RNAAlphabet{2}(), ["U", "CGGUAG", "CCCCU"]),
+        (DNAAlphabet{4}(), ["W", "HKW--", "TAGCTATSG", "TAGC-TAG"]),
+        (RNAAlphabet{4}(), ["WU", "HAUH-CD", "VKMNSU"]),
+        (AminoAcidAlphabet(), ["Q", "PLBMW", "***"])
+    ]
+        for S in Ss
+            for T in [String, SubString, Vector{UInt8}, Test.GenericString]
+                @test guess_alphabet(T(S)) == A
+                @test bioseq(T(S)) isa LongSequence{typeof(A)}
+            end
+        end
+    end
+    for (index, S) in [
+        (4, "QMN!KK"),
+        (7, "ATCGAT???"),
+        (1, ";")
+    ]
+        for T in [String, SubString, Vector{UInt8}, Test.GenericString]
+            @test guess_alphabet(T(S)) == index
+            @test_throws BioSequences.EncodeError bioseq(T(S))
+        end
+    end
+end