From 5c8f294da4f30080d8650ce67c7e561fd6420856 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 18 Jan 2024 06:35:19 +0000 Subject: [PATCH] update and split fgenesh tools --- tools/fgenesh/fgenesh_annotate.xml | 295 +++++++++++++++++++++++++ tools/fgenesh/fgenesh_get_mrnas_gc.xml | 80 +++++++ tools/fgenesh/fgenesh_merge.xml | 87 ++++++++ tools/fgenesh/fgenesh_split.xml | 93 ++++++++ tools/fgenesh/fgenesh_to_genbank.xml | 105 +++++++++ tools/fgenesh/macros.xml | 3 +- 6 files changed, 661 insertions(+), 2 deletions(-) create mode 100644 tools/fgenesh/fgenesh_annotate.xml create mode 100644 tools/fgenesh/fgenesh_get_mrnas_gc.xml create mode 100644 tools/fgenesh/fgenesh_merge.xml create mode 100644 tools/fgenesh/fgenesh_split.xml create mode 100644 tools/fgenesh/fgenesh_to_genbank.xml diff --git a/tools/fgenesh/fgenesh_annotate.xml b/tools/fgenesh/fgenesh_annotate.xml new file mode 100644 index 00000000..346e6a82 --- /dev/null +++ b/tools/fgenesh/fgenesh_annotate.xml @@ -0,0 +1,295 @@ + + sequences + + macros.xml + + + + configfile && + echo "BLASTP = \$BLAST_PATH" >> configfile && + echo "BLAST2 = \$BLAST_PATH" >> configfile && + echo "NUM_THREADS = \${GALAXY_SLOTS:-4}" >> configfile && + cat '$cfg' >> configfile && + ### cat seqlit - data preparation && + #if $inputs.input_type == 'single': + #for $input in $inputs.single_seq + ln -fs '$input' $input.element_identifier && + echo `pwd`/$input.element_identifier >> seqlist && + echo `pwd`/$input.element_identifier > '$input.element_identifier'.list && + #end for + sort seqlist > sorted_seqlist && + #if $repeat_sequence.selector == 'single_masked_seq': + #for $seq in $repeat_sequence.masked_seq_single + ln -fs '$seq' $seq.element_identifier && + echo `pwd`/$seq.element_identifier >> seqlistN && + echo `pwd`/$seq.element_identifier > '$seq.element_identifier'.list && + #end for + sort seqlistN > sorted_seqlistN && + #end if + #elif $inputs.input_type == 'multiple': + #for $i,$input in enumerate($inputs.multiple_seq) + ln 
-fs '$input' $input.element_identifier && + echo `pwd`/$input.element_identifier >> seqlist && + echo `pwd`/$input.element_identifier > '$input.element_identifier'.list && + #end for + #if $repeat_sequence.selector == 'multiple_masked_seq': + #for $e,$mseq in enumerate($repeat_sequence.masked_seq_multiple) + ln -fs '$mseq' $mseq.element_identifier && + echo `pwd`/$mseq.element_identifier >> seqlistN_temp && + paste seqlistN_temp seqlist | sort | cut -f1 > seqlistN && #### sort the filename in the seqlistN file to maintain the same order as the filename in seqlist file + ### prep fo parallel command + echo `pwd`/$mseq.element_identifier > '$mseq.element_identifier'.list && + #end for + #end if + #end if + + ### cat seqlist + #if $repeat_sequence.selector == 'no_repeat_seq': + + for s in `cat sorted_seqlist`; + do + echo "run_pipe.pl configfile -l '\$s'.list -d result_'\$(basename \$s)'"; + done > fgenesh_parallel_command.sh && + + cat fgenesh_parallel_command.sh | parallel --will-cite -j "\${GALAXY_SLOTS:-10}" && + + ####run_pipe.pl configfile -l seqlist -d result && + mv result_*/* result/ && + run_fgenesh_2_gff3.pl result output_gff -sort -print_exons && 2>&1 + #elif $repeat_sequence.selector == 'single_masked_seq': + + + for s in `cat sorted_seqlist`; + do + echo "run_pipe.pl configfile -l '\$s'.list -m '\$s'.N.list -d result_'\$(basename \$s)'"; + done > fgenesh_parallel_command.sh && + + cat fgenesh_parallel_command.sh | parallel --will-cite -j "\${GALAXY_SLOTS:-10}" && + + ####run_pipe.pl configfile -l seqlist -m seqlistN -d result && + mv result_*/* result/ && + run_fgenesh_2_gff3.pl result output_gff -sort -print_exons && 2>&1 + #elif $repeat_sequence.selector == 'multiple_masked_seq': + + for s in `cat seqlist`; + do + echo "run_pipe.pl configfile -l '\$s'.list -m '\$s'.N.list -d result_'\$(basename \$s)'"; + done > fgenesh_parallel_command.sh && + + cat fgenesh_parallel_command.sh | parallel --will-cite -j "\${GALAXY_SLOTS:-10}" && + ###run_pipe.pl 
configfile -l seqlist -m seqlistN -d result && + mv result_*/* result/ && + run_fgenesh_2_gff3.pl result output_gff -sort -print_exons && 2>&1 + #end if + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + `_. Note: The software license needs to acquired prior to use. + + + ]]> + diff --git a/tools/fgenesh/fgenesh_get_mrnas_gc.xml b/tools/fgenesh/fgenesh_get_mrnas_gc.xml new file mode 100644 index 00000000..6dc67039 --- /dev/null +++ b/tools/fgenesh/fgenesh_get_mrnas_gc.xml @@ -0,0 +1,80 @@ + + rensn3 and genomic file + + macros.xml + + + + &1 | tee -a '$log' + + ]]> + + + + + + + + + + + + + + + + + + + + + + [-CDS] [-GC] [-fix_id seq_name | seq_No | seq_count] + +- mrna_file - output file with CDS sequences in fasta file format + +- CDS - CDS only + +- GC - rerport GC donor splice sites + +- fix_id +- 1) seq_name - use sequence names [example of ID: 'ENm002_gene_7'] +- 2) seq_No - (numbers are taken from 'Sequence: ' if such field is present, e.g., "Length of sequence: 1000000, Sequence: 2, File: encode_hg17_44N.fa") - [example of ID: 'seq_2_gene_7'] +- 3) seq_count - use numbers (count sequences starting from 1) [example of ID: 'seq_2_gene_7'] + +**Output** + +- mrna_file - output file with CDS sequences in fasta file format + +- report - a report file of CDS and GC + + +.. class:: infomark + +**Contributor** + +Galaxy Australia wrapped the Fgenesh: the original software is available on this website `FGENESH `_. Note: The software license needs to acquired prior to use. 
+ + + ]]> + diff --git a/tools/fgenesh/fgenesh_merge.xml b/tools/fgenesh/fgenesh_merge.xml new file mode 100644 index 00000000..566265f1 --- /dev/null +++ b/tools/fgenesh/fgenesh_merge.xml @@ -0,0 +1,87 @@ + + gff3 or resn3 file + + macros.xml + + + + > resn3.list && + #end for + merge_res_files.pl -l resn3.list -dir input_resn3_files/ -o $output_resn3 + #end if + + ]]> + + + + + + + + + + + + + + + + + input['selector'] == "gff" + + + input['selector'] == "resn3" + + + + + + + + + `_. Note: The software license needs to acquired prior to use. + + + ]]> + diff --git a/tools/fgenesh/fgenesh_split.xml b/tools/fgenesh/fgenesh_split.xml new file mode 100644 index 00000000..2c0c5d7a --- /dev/null +++ b/tools/fgenesh/fgenesh_split.xml @@ -0,0 +1,93 @@ + + fasta sequences + + macros.xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + `_. Note: The software license needs to acquired prior to use. + + + ]]> + diff --git a/tools/fgenesh/fgenesh_to_genbank.xml b/tools/fgenesh/fgenesh_to_genbank.xml new file mode 100644 index 00000000..493da03d --- /dev/null +++ b/tools/fgenesh/fgenesh_to_genbank.xml @@ -0,0 +1,105 @@ + + create genbank file + + macros.xml + + + + > resn3.list && + #end for + + sort resn3.list > resn3_sorted.list && + + #for $seq_file in $sequence_collection + ln -s '$seq_file' ${seq_file.element_identifier} && + echo ${seq_file.element_identifier} >> seq.list && + #end for + + sort seq.list > seq_sorted.list && + + ln -s $genbank_header header && + + run_fgenesh_2_genbank.pl $polya $skip_empty + -div:$div + -org_code:$organism_code + -method:$method + header + resn3_sorted.list + seq_sorted.list + output_dir/ + 2>&1 + + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + +- header file - file with additional comments (use Genbank keywords) + +- res_files_list - a list of resn3 files (i.e a collection of resn3 files in the Galaxy history) + +- seq_files_list - a list of corresponding fasta files to the input resn3 files in res_files_list (i.e a 
collection of fasta files in the Galaxy history) + +- option + +- taxa - annotate predicted TATA boxes (not implemented in this wrapper due to an unexpected bug) +- polya - annotate predicted PolyA signals +- div:
- GenBank division (PRI by default) +- org_code: - 2-letter organism code used as prefix for gene names, for example: HS - Homo sapiens, PG - Punica granatum (Pomegranate), EG - Elaeis guineensis, GN is used by default if no other code is provided +- method: - program used for gene prediction (Fgenesh, Fgenesh++; Fgenesh by default) +- skip_empty - do not append records for sequences with no predictions + +**Output** + +- folder - contains a list of genbank files (aka a collection in the Galaxy history) + +.. class:: infomark + +**Contributor** + +Galaxy Australia wrapped the Fgenesh: the original software is available on this website `FGENESH `_. Note: The software license needs to be acquired prior to use. + + + ]]> + diff --git a/tools/fgenesh/macros.xml b/tools/fgenesh/macros.xml index ec455c7e..198dc4fc 100644 --- a/tools/fgenesh/macros.xml +++ b/tools/fgenesh/macros.xml @@ -6,8 +6,7 @@ 3e414082c1a12393ab10b1bc4e22de540397fef626840945824a76f6d62def6b - wthang/genomeannotation:latest - blast + wthang/genomepannotation:v2