From 78434d0a6c82ea2cb6e3f8f885dc90da3ead4130 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Camilo=20Garc=C3=ADa?= Date: Fri, 29 Mar 2024 17:52:27 -0500 Subject: [PATCH] Remove FASTX methods and make it lighter --- Manifest.toml | 41 +-------------------------- Project.toml | 2 -- src/GeneFinder.jl | 2 +- src/getorfs.jl | 72 +++++++++++++++++++++++------------------------ src/utils.jl | 46 +++++++++++++++--------------- test/runtests.jl | 11 ++++++++ 6 files changed, 72 insertions(+), 102 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index 371ac23..00b6976 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,19 +2,7 @@ julia_version = "1.10.2" manifest_format = "2.0" -project_hash = "4f6774eaeaec568c8cb7a865a84c0c7d9a180727" - -[[deps.Automa]] -deps = ["PrecompileTools", "TranscodingStreams"] -git-tree-sha1 = "588e0d680ad1d7201d4c6a804dcb1cd9cba79fbb" -uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b" -version = "1.0.3" - -[[deps.BioGenerics]] -deps = ["TranscodingStreams"] -git-tree-sha1 = "7bbc085aebc6faa615740b63756e4986c9e85a70" -uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea" -version = "0.1.4" +project_hash = "0426921543f1f35e1faa99ad2688f4a61508977b" [[deps.BioSequences]] deps = ["BioSymbols", "PrecompileTools", "Random", "Twiddle"] @@ -32,16 +20,6 @@ version = "5.1.3" deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" -[[deps.FASTX]] -deps = ["Automa", "BioGenerics", "PrecompileTools", "StringViews", "TranscodingStreams"] -git-tree-sha1 = "bff5d62bf5e1c382a370ac701bcaea9a24115ac6" -uuid = "c2308a5c-f048-11e8-3e8a-31650f418d12" -version = "2.1.4" -weakdeps = ["BioSequences"] - - [deps.FASTX.extensions] - BioSequencesExt = "BioSequences" - [[deps.IterTools]] git-tree-sha1 = "42d5f897009e7ff2cf88db414a389e5ed1bdd023" uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" @@ -71,28 +49,11 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" version = "0.7.0" -[[deps.StringViews]] -git-tree-sha1 = "f7b06677eae2571c888fd686ba88047d8738b0e3" -uuid = "354b36f9-a18e-4713-926e-db85100087ba" -version = "1.3.3" - [[deps.TOML]] deps = ["Dates"] uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" version = "1.0.3" -[[deps.TranscodingStreams]] -git-tree-sha1 = "71509f04d045ec714c4748c785a59045c3736349" -uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" -version = "0.10.7" - - [deps.TranscodingStreams.extensions] - TestExt = ["Test", "Random"] - - [deps.TranscodingStreams.weakdeps] - Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - [[deps.Twiddle]] git-tree-sha1 = "29509c4862bfb5da9e76eb6937125ab93986270a" uuid = "7200193e-83a8-5a55-b20d-5d36d44a0795" diff --git a/Project.toml b/Project.toml index 093d413..9dec4f7 100644 --- a/Project.toml +++ b/Project.toml @@ -5,13 +5,11 @@ version = "0.2.0" [deps] BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" -FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" [compat] BioSequences = "3" -FASTX = "2" IterTools = "1.4" PrecompileTools = "1" julia = "1" diff --git a/src/GeneFinder.jl b/src/GeneFinder.jl index 01959a5..fbbbd74 100644 --- a/src/GeneFinder.jl +++ b/src/GeneFinder.jl @@ -20,7 +20,7 @@ using BioSequences: ncbi_trans_table, translate -using FASTX: FASTAReader, FASTARecord, description, sequence +# using FASTX: FASTAReader, FASTARecord, description, sequence using IterTools: takewhile, iterated using PrecompileTools: @setup_workload, @compile_workload diff --git a/src/getorfs.jl b/src/getorfs.jl index 46887c2..fed466b 100644 --- a/src/getorfs.jl +++ b/src/getorfs.jl @@ -1,4 +1,4 @@ -export get_orfs_dna, get_orfs_aa, record_orfs_fna, record_orfs_faa +export get_orfs_dna, get_orfs_aa #### get_orfs_* methods #### @@ -77,23 +77,23 @@ An array of `FASTARecord` objects representing the identified ORFs. # Description This function searches for Open Reading Frames (ORFs) in a given nucleic acid sequence. An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon, without any other stop codons in between. By default, only the standard start codon (ATG) is considered, but if `alternative_start` is set to `true`, alternative start codons are also considered. The minimum length of an ORF to be recorded can be specified using the `min_len` argument. """ -function record_orfs_fna( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false, - min_len::Int64 = 6 -) where {N} - orfs = findorfs(sequence; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - records = FASTARecord[] - @inbounds for (index, orf) in enumerate(orfs) - id = string(lpad(string(index), padding, "0")) - header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" - record = FASTARecord(header, sequence[orf]) - push!(records, record) - end - return records -end +# function record_orfs_fna( +# sequence::NucleicSeqOrView{DNAAlphabet{N}}; +# alternative_start::Bool = false, +# min_len::Int64 = 6 +# ) where {N} +# orfs = findorfs(sequence; alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# records = FASTARecord[] +# @inbounds for (index, orf) in enumerate(orfs) +# id = string(lpad(string(index), padding, "0")) +# header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" +# record = FASTARecord(header, sequence[orf]) +# push!(records, record) +# end +# return records +# end """ record_orfs_faa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N} @@ -111,21 +111,21 @@ The function returns a list of FASTA records, where each record represents an OR # Returns - A list of FASTA records representing the ORFs found in the sequence. """ -function record_orfs_faa( - sequence::NucleicSeqOrView{DNAAlphabet{N}}; - alternative_start::Bool = false, - code::GeneticCode = ncbi_trans_table[1], - min_len::Int64 = 6 -) where {N} - orfs = findorfs(sequence; alternative_start, min_len) - norfs = length(orfs) - padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) - records = FASTARecord[] - @inbounds for (index, orf) in enumerate(orfs) - id = string(lpad(string(index), padding, "0")) - header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" - record = FASTARecord(header, translate(sequence[orf]; code)) - push!(records, record) - end - return records -end \ No newline at end of file +# function record_orfs_faa( +# sequence::NucleicSeqOrView{DNAAlphabet{N}}; +# alternative_start::Bool = false, +# code::GeneticCode = ncbi_trans_table[1], +# min_len::Int64 = 6 +# ) where {N} +# orfs = findorfs(sequence; alternative_start, min_len) +# norfs = length(orfs) +# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs)) +# records = FASTARecord[] +# @inbounds for (index, orf) in enumerate(orfs) +# id = string(lpad(string(index), padding, "0")) +# header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)" +# record = FASTARecord(header, translate(sequence[orf]; code)) +# push!(records, record) +# end +# return records +# end \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index ea57668..2571723 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,15 +1,15 @@ -export fasta_to_dna, hasprematurestop +# export fasta_to_dna, hasprematurestop # General purposes methods supporting main functions -""" - fasta_to_dna(input::String) - -Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects. -""" -function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}} - FASTAReader(open(input)) do reader - return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader] - end -end +# """ +# fasta_to_dna(input::String) + +# Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects. +# """ +# function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}} +# FASTAReader(open(input)) do reader +# return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader] +# end +# end # function gff_to_dna(input::AbstractString) # GFF3.Reader(open(input)) do reader @@ -24,23 +24,23 @@ Determine whether the `sequence` of type `LongSequence{DNAAlphabet{4}}` contains Returns a boolean indicating whether the `sequence` has more than one stop codon. """ -function hasprematurestop(sequence::NucleicSeqOrView{DNAAlphabet{N}})::Bool where {N} +# function hasprematurestop(sequence::NucleicSeqOrView{DNAAlphabet{N}})::Bool where {N} - stopcodons = [LongDNA{4}("TAA"), LongDNA{4}("TAG"), LongDNA{4}("TGA")] # Create a set of stop codons +# stopcodons = [LongDNA{4}("TAA"), LongDNA{4}("TAG"), LongDNA{4}("TGA")] # Create a set of stop codons - length(sequence) % 3 == 0 || error("The sequence is not divisible by 3") +# length(sequence) % 3 == 0 || error("The sequence is not divisible by 3") - occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence") +# occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence") - @inbounds for i in 1:3:length(sequence) - 4 - codon = sequence[i:i+2] - if codon in stopcodons - return true - end - end +# @inbounds for i in 1:3:length(sequence) - 4 +# codon = sequence[i:i+2] +# if codon in stopcodons +# return true +# end +# end - return false -end +# return false +# end @doc raw""" iscoding( diff --git a/test/runtests.jl b/test/runtests.jl index d89a385..899fe24 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,6 +6,17 @@ using FASTX using GeneFinder using Aqua +""" + fasta_to_dna(input::String) + +Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects. +""" +function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}} + FASTAReader(open(input)) do reader + return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader] + end +end + include("findorfstest.jl") include("iotest.jl") include("getindextest.jl")