BioJulia · jakobnissen · Oct 18, 2024 · Oct 18, 2024
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "BioSequences"
 uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
 authors = ["Sabrina Jaye Ward <[email protected]>", "Jakob Nissen <[email protected]>"]
-version = "3.1.6"
+version = "3.2.0"
 
 [deps]
 BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"

diff --git a/docs/src/counting.md b/docs/src/counting.md
@@ -59,4 +59,13 @@ For such sequence and function combinations, `Base.count(f, seq)` is overloaded
 to call an internal `BioSequences.count_*_bitpar` function, which is passed the
 sequence(s). If you want to force BioSequences to use naive counting for the
 purposes of testing or debugging for example, then you can call
-`BioSequences.count_naive` directly.
+`BioSequences.count_naive` directly.
+
+```@docs
+gc_content
+n_gaps
+n_certain
+n_ambiguous
+matches
+mismatches
+```
diff --git a/src/biosequence/counting.jl b/src/biosequence/counting.jl
@@ -53,20 +53,112 @@ Base.count(::typeof(iscertain), seqa::S, seqb::S) where {S<:BioSequence{<:Nuclei
 ###
 
 """
-    gc_content(seq::BioSequence)
+    gc_content(seq::BioSequence) -> Float64
 
-Calculate GC content of `seq`.
+Calculate GC content of `seq`, i.e. the number of symbols that is `DNA_C`, `DNA_G`,
+`DNA_C` or `DNA_G` divided by the length of the sequence.
+
+# Examples
+```jldoctest
+julia> gc_content(dna"AGCTA")
+0.4
+
+julia> gc_content(rna"UAGCGA")
+0.5
+```
 """
 gc_content(seq::NucleotideSeq) = isempty(seq) ? 0.0 : count(isGC, seq) / length(seq)
 
-n_ambiguous(seq) = count(isambiguous, seq)
+"""
+    n_ambiguous(a::BioSequence, [b::BioSequence]) -> Int
+
+Count the number of positions where `a` (or `b`, if present) have ambigious symbols.
+If `b` is given, and the length of `a` and `b` differ, look only at the indices
+of the shorter sequence.
+
+# Examples
+```jldoctest
+julia> n_ambiguous(dna"--TAC-WN-ACY")
+3
+
+julia> n_ambiguous(rna"UAYWW", rna"UAW")
+1
+```
+"""
+n_ambiguous(seq::BioSequence) = count(isambiguous, seq)
 n_ambiguous(seqa::BioSequence, seqb::BioSequence) = count(isambiguous_or, seqa, seqb)
 
-n_certain(seq) = count(iscertain, seq)
+"""
+    n_certain(a::BioSequence, [b::BioSequence]) -> Int
+
+Count the number of positions where `a` (and `b`, if present) have certain (i.e. non-ambigous
+and non-gap) symbols.
+If `b` is given, and the length of `a` and `b` differ, look only at the indices
+of the shorter sequence.
+
+# Examples
+```jldoctest
+julia> n_certain(dna"--TAC-WN-ACY")
+5
+
+julia> n_certain(rna"UAYWW", rna"UAW")
+2
+```
+"""
+n_certain(seq::BioSequence) = count(iscertain, seq)
 n_certain(seqa::BioSequence, seqb::BioSequence) = count(iscertain_and, seqa, seqb)
 
+"""
+    n_gaps(a::BioSequence, [b::BioSequence]) -> Int
+
+Count the number of positions where `a` (or `b`, if present) have gaps.
+If `b` is given, and the length of `a` and `b` differ, look only at the indices
+of the shorter sequence.
+
+# Examples
+```jldoctest
+julia> n_gaps(dna"--TAC-WN-ACY")
+4
+
+julia> n_gaps(dna"TC-AC-", dna"-CACG")
+2
+```
+"""
 n_gaps(seq::BioSequence) = count(isgap, seq)
 n_gaps(seqa::BioSequence, seqb::BioSequence) = count(isgap_or, seqa, seqb)
 
+"""
+    mismatches(a::BioSequence, b::BioSequences) -> Int
+
+Count the number of positions in where `a` and `b` differ.
+If `b` is given, and the length of `a` and `b` differ, look only at the indices
+of the shorter sequence.
+
+# Examples
+```jldoctest
+julia> mismatches(dna"TAGCTA", dna"TACCTA")
+1
+
+julia> mismatches(dna"AACA", dna"AAG")
+1
+```
+"""
 mismatches(seqa::BioSequence, seqb::BioSequence) = count(!=, seqa, seqb)
+
+"""
+    mismatches(a::BioSequence, b::BioSequences) -> Int
+
+Count the number of positions in where `a` and `b` are equal.
+If `b` is given, and the length of `a` and `b` differ, look only at the indices
+of the shorter sequence.
+
+# Examples
+```jldoctest
+julia> matches(dna"TAGCTA", dna"TACCTA")
+5
+
+julia> matches(dna"AACA", dna"AAG")
+2
+```
+"""
 matches(seqa::BioSequence, seqb::BioSequence) = count(==, seqa, seqb)
diff --git a/src/bit-manipulation/bitpar-compiler.jl b/src/bit-manipulation/bitpar-compiler.jl
@@ -131,9 +131,9 @@
         @assert length(seqa) ≤ length(seqb)
 
         nexta = bitindex(seqa, 1)
-        stopa = bitindex(seqa, lastindex(seqa) + 1)
+        stopa = bitindex(seqa, (lastindex(seqa) + 1) % UInt)
         nextb = bitindex(seqb, 1)
-        stopb = bitindex(seqb, lastindex(seqb) + 1)
+        stopb = bitindex(seqb, (lastindex(seqb) + 1) % UInt)
         adata = seqa.data
         bdata = seqb.data
 
@@ -148,8 +148,8 @@
         if nexta < stopa && offset(nexta) != 0
             # Here we shift the first data chunks to the right so as the first
             # nucleotide of the seq/subseq is the first nibble / pair of bits.
-            x = adata[index(nexta)] >> offset(nexta)
-            y = bdata[index(nextb)] >> offset(nextb)
+            x = @inbounds adata[index(nexta)] >> offset(nexta)
+            y = @inbounds bdata[index(nextb)] >> offset(nextb)
             # Here it was assumed that there is something to go and get from
             # the next chunk of `b`, yet that may not be true.
             # We know that if this is not true of `b`, then it is certainly not
@@ -160,7 +160,7 @@
             # This edge case was found and accounted for by Ben J. Ward @BenJWard.
             # Ask this maintainer for more information.
             if offset(nextb) > offset(nexta) && 64 - offset(nextb) < stopb - nextb
-                y |= bdata[index(nextb) + 1] << (64 - offset(nextb))
+                y |= @inbounds bdata[index(nextb) + 1] << (64 - offset(nextb))
             end
             # Here we need to check something, we need to check if the
             # integer of `a` we are currently aligning contains the end of
@@ -192,8 +192,8 @@
 
         if offset(nextb) == 0  # data are aligned with each other
             while stopa - nexta ≥ 64 # Iterate through body of data
-                x = adata[index(nexta)]
-                y = bdata[index(nextb)]
+                x = @inbounds adata[index(nexta)]
+                y = @inbounds bdata[index(nextb)]
 
                 $(body_code)
 
@@ -202,8 +202,8 @@
             end
 
             if nexta < stopa
-                x = adata[index(nexta)]
-                y = bdata[index(nextb)]
+                x = @inbounds adata[index(nexta)]
+                y = @inbounds bdata[index(nextb)]
 
                 offs = stopa - nexta
                 m = bitmask(offs)
@@ -218,8 +218,8 @@
             # Note that here, updating `nextb` by 64, increases the chunk index,
             # but the `offset(nextb)` will remain the same.
             while stopa - nexta ≥ 64 # processing body of data
-                x = adata[index(nexta)]
-                z = bdata[index(nextb)]
+                x = @inbounds adata[index(nexta)]
+                z = @inbounds bdata[index(nextb)]
                 y = y >> offset(nextb) | z << (64 - offset(nextb))
 
                 $(body_code)
@@ -230,7 +230,7 @@
             end
 
             if nexta < stopa # processing tail of data
-                x = adata[index(nexta)]
+                x = @inbounds adata[index(nexta)]
                 y = y >> offset(nextb)
                 if 64 - offset(nextb) < stopa - nexta
                     y |= bdata[index(nextb)] << (64 - offset(nextb))