Skip to content

Commit

Permalink
Remove FASTX methods and make it lighter
Browse files Browse the repository at this point in the history
  • Loading branch information
camilogarciabotero committed Mar 29, 2024
1 parent ab25b99 commit 78434d0
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 102 deletions.
41 changes: 1 addition & 40 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,7 @@

julia_version = "1.10.2"
manifest_format = "2.0"
project_hash = "4f6774eaeaec568c8cb7a865a84c0c7d9a180727"

[[deps.Automa]]
deps = ["PrecompileTools", "TranscodingStreams"]
git-tree-sha1 = "588e0d680ad1d7201d4c6a804dcb1cd9cba79fbb"
uuid = "67c07d97-cdcb-5c2c-af73-a7f9c32a568b"
version = "1.0.3"

[[deps.BioGenerics]]
deps = ["TranscodingStreams"]
git-tree-sha1 = "7bbc085aebc6faa615740b63756e4986c9e85a70"
uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea"
version = "0.1.4"
project_hash = "0426921543f1f35e1faa99ad2688f4a61508977b"

[[deps.BioSequences]]
deps = ["BioSymbols", "PrecompileTools", "Random", "Twiddle"]
Expand All @@ -32,16 +20,6 @@ version = "5.1.3"
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"

[[deps.FASTX]]
deps = ["Automa", "BioGenerics", "PrecompileTools", "StringViews", "TranscodingStreams"]
git-tree-sha1 = "bff5d62bf5e1c382a370ac701bcaea9a24115ac6"
uuid = "c2308a5c-f048-11e8-3e8a-31650f418d12"
version = "2.1.4"
weakdeps = ["BioSequences"]

[deps.FASTX.extensions]
BioSequencesExt = "BioSequences"

[[deps.IterTools]]
git-tree-sha1 = "42d5f897009e7ff2cf88db414a389e5ed1bdd023"
uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
Expand Down Expand Up @@ -71,28 +49,11 @@ uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"

[[deps.StringViews]]
git-tree-sha1 = "f7b06677eae2571c888fd686ba88047d8738b0e3"
uuid = "354b36f9-a18e-4713-926e-db85100087ba"
version = "1.3.3"

[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"

[[deps.TranscodingStreams]]
git-tree-sha1 = "71509f04d045ec714c4748c785a59045c3736349"
uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
version = "0.10.7"

[deps.TranscodingStreams.extensions]
TestExt = ["Test", "Random"]

[deps.TranscodingStreams.weakdeps]
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[[deps.Twiddle]]
git-tree-sha1 = "29509c4862bfb5da9e76eb6937125ab93986270a"
uuid = "7200193e-83a8-5a55-b20d-5d36d44a0795"
Expand Down
2 changes: 0 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,11 @@ version = "0.2.0"

[deps]
BioSequences = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
FASTX = "c2308a5c-f048-11e8-3e8a-31650f418d12"
IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"

[compat]
BioSequences = "3"
FASTX = "2"
IterTools = "1.4"
PrecompileTools = "1"
julia = "1"
Expand Down
2 changes: 1 addition & 1 deletion src/GeneFinder.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ using BioSequences:
ncbi_trans_table,
translate

using FASTX: FASTAReader, FASTARecord, description, sequence
# using FASTX: FASTAReader, FASTARecord, description, sequence
using IterTools: takewhile, iterated
using PrecompileTools: @setup_workload, @compile_workload

Expand Down
72 changes: 36 additions & 36 deletions src/getorfs.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
export get_orfs_dna, get_orfs_aa, record_orfs_fna, record_orfs_faa
export get_orfs_dna, get_orfs_aa

#### get_orfs_* methods ####

Expand Down Expand Up @@ -77,23 +77,23 @@ An array of `FASTARecord` objects representing the identified ORFs.
# Description
This function searches for Open Reading Frames (ORFs) in a given nucleic acid sequence. An ORF is a sequence of DNA that starts with a start codon and ends with a stop codon, without any other stop codons in between. By default, only the standard start codon (ATG) is considered, but if `alternative_start` is set to `true`, alternative start codons are also considered. The minimum length of an ORF to be recorded can be specified using the `min_len` argument.
"""
function record_orfs_fna(
    sequence::NucleicSeqOrView{DNAAlphabet{N}};
    alternative_start::Bool = false,
    min_len::Int64 = 6
) where {N}
    orfs = findorfs(sequence; alternative_start, min_len)
    # ORF numbers are zero-padded to the digit count of the total, never fewer than two digits.
    width = max(2, ndigits(length(orfs)))
    records = FASTARecord[]
    @inbounds for (number, orf) in enumerate(orfs)
        tag = lpad(number, width, '0')
        location = orf.location
        # The header carries the ORF number plus its coordinates, strand and frame.
        header = "ORF$(tag) id=$(tag) start=$(location.start) stop=$(location.stop) strand=$(orf.strand) frame=$(orf.frame)"
        # The record body is the nucleotide subsequence selected by the ORF.
        push!(records, FASTARecord(header, sequence[orf]))
    end
    return records
end
# function record_orfs_fna(
# sequence::NucleicSeqOrView{DNAAlphabet{N}};
# alternative_start::Bool = false,
# min_len::Int64 = 6
# ) where {N}
# orfs = findorfs(sequence; alternative_start, min_len)
# norfs = length(orfs)
# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs))
# records = FASTARecord[]
# @inbounds for (index, orf) in enumerate(orfs)
# id = string(lpad(string(index), padding, "0"))
# header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)"
# record = FASTARecord(header, sequence[orf])
# push!(records, record)
# end
# return records
# end

"""
record_orfs_faa(sequence::NucleicSeqOrView{DNAAlphabet{N}}; kwargs...) where {N}
Expand All @@ -111,21 +111,21 @@ The function returns a list of FASTA records, where each record represents an OR
# Returns
- A list of FASTA records representing the ORFs found in the sequence.
"""
function record_orfs_faa(
    sequence::NucleicSeqOrView{DNAAlphabet{N}};
    alternative_start::Bool = false,
    code::GeneticCode = ncbi_trans_table[1],
    min_len::Int64 = 6
) where {N}
    orfs = findorfs(sequence; alternative_start, min_len)
    # ORF numbers are zero-padded to the digit count of the total, never fewer than two digits.
    width = max(2, ndigits(length(orfs)))
    records = FASTARecord[]
    @inbounds for (number, orf) in enumerate(orfs)
        tag = lpad(number, width, '0')
        location = orf.location
        # The header carries the ORF number plus its coordinates, strand and frame.
        header = "ORF$(tag) id=$(tag) start=$(location.start) stop=$(location.stop) strand=$(orf.strand) frame=$(orf.frame)"
        # The record body is the ORF subsequence translated with the requested genetic code.
        protein = translate(sequence[orf]; code)
        push!(records, FASTARecord(header, protein))
    end
    return records
end
# function record_orfs_faa(
# sequence::NucleicSeqOrView{DNAAlphabet{N}};
# alternative_start::Bool = false,
# code::GeneticCode = ncbi_trans_table[1],
# min_len::Int64 = 6
# ) where {N}
# orfs = findorfs(sequence; alternative_start, min_len)
# norfs = length(orfs)
# padding = norfs < 10 ? length(string(norfs)) + 1 : length(string(norfs))
# records = FASTARecord[]
# @inbounds for (index, orf) in enumerate(orfs)
# id = string(lpad(string(index), padding, "0"))
# header = "ORF$(id) id=$(id) start=$(orf.location.start) stop=$(orf.location.stop) strand=$(orf.strand) frame=$(orf.frame)"
# record = FASTARecord(header, translate(sequence[orf]; code))
# push!(records, record)
# end
# return records
# end
46 changes: 23 additions & 23 deletions src/utils.jl
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
export fasta_to_dna, hasprematurestop
# export fasta_to_dna, hasprematurestop
# General-purpose methods supporting the main functions
"""
    fasta_to_dna(input::AbstractString)

Convert a FASTA-formatted file (including a multi-FASTA file) to an array of `LongSequence{DNAAlphabet{4}}` objects, one per record.
"""
function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}}
    DNASeq = LongSequence{DNAAlphabet{4}}
    return FASTAReader(open(input)) do reader
        # Accumulate one DNA sequence per record, in the order the reader yields them.
        seqs = DNASeq[]
        for record in reader
            push!(seqs, DNASeq(sequence(record)))
        end
        seqs
    end
end
# """
# fasta_to_dna(input::String)

# Converts a FASTA formatted file (even if it is a multi-fasta) to an array of `LongSequence{DNAAlphabet{4}}` objects.
# """
# function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}}
# FASTAReader(open(input)) do reader
# return [LongSequence{DNAAlphabet{4}}(sequence(record)) for record in reader]
# end
# end

# function gff_to_dna(input::AbstractString)
# GFF3.Reader(open(input)) do reader
Expand All @@ -24,23 +24,23 @@ Determine whether the `sequence` of type `LongSequence{DNAAlphabet{4}}` contains
Returns a boolean indicating whether the `sequence` has more than one stop codon.
"""
function hasprematurestop(sequence::NucleicSeqOrView{DNAAlphabet{N}})::Bool where {N}
# Report whether any in-frame codon before the final one is a stop codon.
# Throws (via `error`) when the length is not a multiple of 3 or when the
# last three bases are not a stop codon.
# function hasprematurestop(sequence::NucleicSeqOrView{DNAAlphabet{N}})::Bool where {N}

stopcodons = [LongDNA{4}("TAA"), LongDNA{4}("TAG"), LongDNA{4}("TGA")] # Vector holding the three stop codons: TAA, TAG, TGA
# stopcodons = [LongDNA{4}("TAA"), LongDNA{4}("TAG"), LongDNA{4}("TGA")] # Create a set of stop codons

# Precondition: the sequence must split evenly into codons.
length(sequence) % 3 == 0 || error("The sequence is not divisible by 3")
# length(sequence) % 3 == 0 || error("The sequence is not divisible by 3")

# Precondition: the last three bases must match TAG, TAA or TGA.
occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence")
# occursin(biore"T(AG|AA|GA)"dna, sequence[end-2:end]) || error("There is no stop codon at the end of the sequence")

# Step through codon start positions 1, 4, 7, ...; the `- 4` upper bound keeps
# the final codon (already validated above) out of the scan.
@inbounds for i in 1:3:length(sequence) - 4
codon = sequence[i:i+2]
# Return as soon as the first internal stop codon is found.
if codon in stopcodons
return true
end
end
# @inbounds for i in 1:3:length(sequence) - 4
# codon = sequence[i:i+2]
# if codon in stopcodons
# return true
# end
# end

# No stop codon was found before the final codon.
return false
end
# return false
# end

@doc raw"""
iscoding(
Expand Down
11 changes: 11 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,17 @@ using FASTX
using GeneFinder
using Aqua

"""
    fasta_to_dna(input::AbstractString)

Convert a FASTA-formatted file (including a multi-FASTA file) to an array of `LongSequence{DNAAlphabet{4}}` objects, one per record.
"""
function fasta_to_dna(input::AbstractString)::Vector{LongSequence{DNAAlphabet{4}}}
    DNASeq = LongSequence{DNAAlphabet{4}}
    return FASTAReader(open(input)) do reader
        # Accumulate one DNA sequence per record, in the order the reader yields them.
        seqs = DNASeq[]
        for record in reader
            push!(seqs, DNASeq(sequence(record)))
        end
        seqs
    end
end

include("findorfstest.jl")
include("iotest.jl")
include("getindextest.jl")
Expand Down

0 comments on commit 78434d0

Please sign in to comment.