Remove FASTX methods and make it lighter

camilogarciabotero · Mar 29, 2024 · 78434d0 · 78434d0
1 parent ab25b99
commit 78434d0
Show file tree

Hide file tree

Showing 6 changed files with 72 additions and 102 deletions.
diff --git a/Manifest.toml b/Manifest.toml
@@ -2,19 +2,7 @@
 
 julia_version = "1.10.2"
 manifest_format = "2.0"
-project_hash = "4f6774eaeaec568c8cb7a865a84c0c7d9a180727"
-
-[[deps.Automa]]
-deps = ["PrecompileTools", "TranscodingStreams"]
-git-tree-sha1 = "588e0d680ad1d7201d4c6a804dcb1cd9cba79fbb"
-uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
-version = "1.0.3"
-
-[[deps.BioGenerics]]
-deps = ["TranscodingStreams"]
-git-tree-sha1 = "7bbc085aebc6faa615740b63756e4986c9e85a70"
-uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea"
-version = "0.1.4"
+project_hash = "0426921543f1f35e1faa99ad2688f4a61508977b"
 
 [[deps.BioSequences]]
 deps = ["BioSymbols", "PrecompileTools", "Random", "Twiddle"]
@@ -32,16 +20,6 @@ version = "5.1.3"
 deps = ["Printf"]
 uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
 
-[[deps.FASTX]]
-deps = ["Automa", "BioGenerics", "PrecompileTools", "StringViews", "TranscodingStreams"]
-git-tree-sha1 = "bff5d62bf5e1c382a370ac701bcaea9a24115ac6"
-uuid = "c2308a5c-f048-11e8-3e8a-31650f418d12"
-version = "2.1.4"
-weakdeps = ["BioSequences"]
-
-    [deps.FASTX.extensions]
-    BioSequencesExt = "BioSequences"
-
 [[deps.IterTools]]
 git-tree-sha1 = "42d5f897009e7ff2cf88db414a389e5ed1bdd023"
 uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
@@ -71,28 +49,11 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 version = "0.7.0"
 
-[[deps.StringViews]]
-git-tree-sha1 = "f7b06677eae2571c888fd686ba88047d8738b0e3"
-uuid = "354b36f9-a18e-4713-926e-db85100087ba"
-version = "1.3.3"
-
 [[deps.TOML]]
 deps = ["Dates"]
 uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
 version = "1.0.3"
 
-[[deps.TranscodingStreams]]
-git-tree-sha1 = "71509f04d045ec714c4748c785a59045c3736349"
-uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
-version = "0.10.7"
-
-    [deps.TranscodingStreams.extensions]
-    TestExt = ["Test", "Random"]
-
-    [deps.TranscodingStreams.weakdeps]
-    Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-    Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
 [[deps.Twiddle]]
 git-tree-sha1 = "29509c4862bfb5da9e76eb6937125ab93986270a"
 uuid = "7200193e-83a8-5a55-b20d-5d36d44a0795"

diff --git a/Project.toml b/Project.toml
@@ -5,13 +5,11 @@ version = "0.2.0"
 
 [deps]
 BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
-FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12"
 IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 
 [compat]
 BioSequences = "3"
-FASTX = "2"
 IterTools = "1.4"
 PrecompileTools = "1"
 julia = "1"

diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl
@@ -20,7 +20,7 @@ using BioSequences:
     ncbi_trans_table,
     translate
 
-using FASTX: FASTAReader, FASTARecord, description, sequence
+# using FASTX: FASTAReader, FASTARecord, description, sequence
 using IterTools: takewhile, iterated
 using PrecompileTools: @setup_workload, @compile_workload
 

diff --git a/src/getorfs.jl b/src/getorfs.jl
@@ -1,4 +1,4 @@
-export get_orfs_dna, get_orfs_aa, record_orfs_fna, record_orfs_faa
+export get_orfs_dna, get_orfs_aa
 
 #### get_orfs_* methods ####
 
@@ -77,23 +77,23 @@ An array of `FASTARecord` objects representing the identified ORFs.
 # Description
 This function searches for Open Reading Frames (ORFs) in a given nucleic acid sequence. An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon, without any other stop codons in between. By default, only the standard start codon (ATG) is considered, but if `alternative_start` is set to `true`, alternative start codons are also considered. The minimum length of an ORF to be recorded can be specified using the `min_len` argument.
 """
-function record_orfs_fna(
-    sequence::NucleicSeqOrView{DNAAlphabet{N}}; 
-    alternative_start::Bool = false, 
-    min_len::Int64 = 6
-) where {N}
-    orfs = findorfs(sequence; alternative_start, min_len)
-    norfs = length(orfs)
-    padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs))
-    records = FASTARecord[]
-    @inbounds for (index, orf) in enumerate(orfs)
-        id = string(lpad(string(index), padding, "0"))
-        header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)"
-        record = FASTARecord(header, sequence[orf])
-        push!(records, record)
-    end
-    return records
-end
+# function record_orfs_fna(
+#     sequence::NucleicSeqOrView{DNAAlphabet{N}}; 
+#     alternative_start::Bool = false, 
+#     min_len::Int64 = 6
+# ) where {N}
+#     orfs = findorfs(sequence; alternative_start, min_len)
+#     norfs = length(orfs)
+#     padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs))
+#     records = FASTARecord[]
+#     @inbounds for (index, orf) in enumerate(orfs)
+#         id = string(lpad(string(index), padding, "0"))
+#         header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)"
+#         record = FASTARecord(header, sequence[orf])
+#         push!(records, record)
+#     end
+#     return records
+# end
 
 """
     record_orfs_faa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N}
@@ -111,21 +111,21 @@ The function returns a list of FASTA records, where each record represents an OR
 # Returns
 - A list of FASTA records representing the ORFs found in the sequence.
 """
-function record_orfs_faa(
-    sequence::NucleicSeqOrView{DNAAlphabet{N}};
-    alternative_start::Bool = false, 
-    code::GeneticCode = ncbi_trans_table[1],
-    min_len::Int64 = 6
-) where {N}
-    orfs = findorfs(sequence; alternative_start, min_len)
-    norfs = length(orfs)
-    padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs))
-    records = FASTARecord[]
-    @inbounds for (index, orf) in enumerate(orfs)
-        id = string(lpad(string(index), padding, "0"))
-        header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)"
-        record = FASTARecord(header, translate(sequence[orf]; code))
-        push!(records, record)
-    end
-    return records
-end
+# function record_orfs_faa(
+#     sequence::NucleicSeqOrView{DNAAlphabet{N}};
+#     alternative_start::Bool = false, 
+#     code::GeneticCode = ncbi_trans_table[1],
+#     min_len::Int64 = 6
+# ) where {N}
+#     orfs = findorfs(sequence; alternative_start, min_len)
+#     norfs = length(orfs)
+#     padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs))
+#     records = FASTARecord[]
+#     @inbounds for (index, orf) in enumerate(orfs)
+#         id = string(lpad(string(index), padding, "0"))
+#         header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)"
+#         record = FASTARecord(header, translate(sequence[orf]; code))
+#         push!(records, record)
+#     end
+#     return records
+# end
diff --git a/src/utils.jl b/src/utils.jl
@@ -1,15 +1,15 @@
-export fasta_to_dna, hasprematurestop
+# export fasta_to_dna, hasprematurestop
 # General purposes methods supporting main functions
-"""
-    fasta_to_dna(input::String)
-
-Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects.
-"""
-function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}}
-    FASTAReader(open(input)) do reader
-        return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader]
-    end
-end
+# """
+#     fasta_to_dna(input::String)
+
+# Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects.
+# """
+# function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}}
+#     FASTAReader(open(input)) do reader
+#         return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader]
+#     end
+# end
 
 # function gff_to_dna(input::AbstractString)
 #     GFF3.Reader(open(input)) do reader
@@ -24,23 +24,23 @@ Determine whether the `sequence` of type `LongSequence{DNAAlphabet{4}}` contains
 
 Returns a boolean indicating whether the `sequence` has more than one stop codon.
 """
-function hasprematurestop(sequence::NucleicSeqOrView{DNAAlphabet{N}})::Bool where {N}
+# function hasprematurestop(sequence::NucleicSeqOrView{DNAAlphabet{N}})::Bool where {N}
 
-    stopcodons = [LongDNA{4}("TAA"), LongDNA{4}("TAG"), LongDNA{4}("TGA")]  # Create a set of stop codons
+#     stopcodons = [LongDNA{4}("TAA"), LongDNA{4}("TAG"), LongDNA{4}("TGA")]  # Create a set of stop codons
 
-    length(sequence) % 3 == 0 || error("The sequence is not divisible by 3")
+#     length(sequence) % 3 == 0 || error("The sequence is not divisible by 3")
 
-    occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence")
+#     occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence")
 
-    @inbounds for i in 1:3:length(sequence) - 4
-        codon = sequence[i:i+2]
-        if codon in stopcodons
-            return true
-        end
-    end
+#     @inbounds for i in 1:3:length(sequence) - 4
+#         codon = sequence[i:i+2]
+#         if codon in stopcodons
+#             return true
+#         end
+#     end
 
-    return false
-end
+#     return false
+# end
 
 @doc raw"""
     iscoding(

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -6,6 +6,17 @@ using FASTX
 using GeneFinder
 using Aqua
 
+"""
+    fasta_to_dna(input::AbstractString)
+
+Convert a FASTA-formatted file (single- or multi-record) into a vector of `LongSequence{DNAAlphabet{4}}` objects, one per record.
+"""
+function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}}
+    FASTAReader(open(input)) do reader
+        return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader]
+    end
+end
+
 include("findorfstest.jl")
 include("iotest.jl")
 include("getindextest.jl")