-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #102 from mthang/fgenesh_update
update and split fgenesh tools
- Loading branch information
Showing
6 changed files
with
661 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,295 @@ | ||
<tool id="fgenesh_annotate" name="FGENESH annotate" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" > | ||
<description>sequences</description> | ||
<macros> | ||
<import>macros.xml</import> | ||
</macros> | ||
<expand macro="requirements" /> | ||
<expand macro="stdio" /> | ||
<command><![CDATA[ | ||
mkdir -p result && | ||
mkdir -p output_gff && | ||
BLAST_PATH=\$(which blastp) > configfile && | ||
echo "BLASTP = \$BLAST_PATH" >> configfile && | ||
echo "BLAST2 = \$BLAST_PATH" >> configfile && | ||
echo "NUM_THREADS = \${GALAXY_SLOTS:-4}" >> configfile && | ||
cat '$cfg' >> configfile && | ||
### cat seqlit - data preparation && | ||
#if $inputs.input_type == 'single': | ||
#for $input in $inputs.single_seq | ||
ln -fs '$input' $input.element_identifier && | ||
echo `pwd`/$input.element_identifier >> seqlist && | ||
echo `pwd`/$input.element_identifier > '$input.element_identifier'.list && | ||
#end for | ||
sort seqlist > sorted_seqlist && | ||
#if $repeat_sequence.selector == 'single_masked_seq': | ||
#for $seq in $repeat_sequence.masked_seq_single | ||
ln -fs '$seq' $seq.element_identifier && | ||
echo `pwd`/$seq.element_identifier >> seqlistN && | ||
echo `pwd`/$seq.element_identifier > '$seq.element_identifier'.list && | ||
#end for | ||
sort seqlistN > sorted_seqlistN && | ||
#end if | ||
#elif $inputs.input_type == 'multiple': | ||
#for $i,$input in enumerate($inputs.multiple_seq) | ||
ln -fs '$input' $input.element_identifier && | ||
echo `pwd`/$input.element_identifier >> seqlist && | ||
echo `pwd`/$input.element_identifier > '$input.element_identifier'.list && | ||
#end for | ||
#if $repeat_sequence.selector == 'multiple_masked_seq': | ||
#for $e,$mseq in enumerate($repeat_sequence.masked_seq_multiple) | ||
ln -fs '$mseq' $mseq.element_identifier && | ||
echo `pwd`/$mseq.element_identifier >> seqlistN_temp && | ||
paste seqlistN_temp seqlist | sort | cut -f1 > seqlistN && #### sort the filename in the seqlistN file to maintain the same order as the filename in seqlist file | ||
### prep fo parallel command | ||
echo `pwd`/$mseq.element_identifier > '$mseq.element_identifier'.list && | ||
#end for | ||
#end if | ||
#end if | ||
### cat seqlist | ||
#if $repeat_sequence.selector == 'no_repeat_seq': | ||
for s in `cat sorted_seqlist`; | ||
do | ||
echo "run_pipe.pl configfile -l '\$s'.list -d result_'\$(basename \$s)'"; | ||
done > fgenesh_parallel_command.sh && | ||
cat fgenesh_parallel_command.sh | parallel --will-cite -j "\${GALAXY_SLOTS:-10}" && | ||
####run_pipe.pl configfile -l seqlist -d result && | ||
mv result_*/* result/ && | ||
run_fgenesh_2_gff3.pl result output_gff -sort -print_exons && 2>&1 | ||
#elif $repeat_sequence.selector == 'single_masked_seq': | ||
for s in `cat sorted_seqlist`; | ||
do | ||
echo "run_pipe.pl configfile -l '\$s'.list -m '\$s'.N.list -d result_'\$(basename \$s)'"; | ||
done > fgenesh_parallel_command.sh && | ||
cat fgenesh_parallel_command.sh | parallel --will-cite -j "\${GALAXY_SLOTS:-10}" && | ||
####run_pipe.pl configfile -l seqlist -m seqlistN -d result && | ||
mv result_*/* result/ && | ||
run_fgenesh_2_gff3.pl result output_gff -sort -print_exons && 2>&1 | ||
#elif $repeat_sequence.selector == 'multiple_masked_seq': | ||
for s in `cat seqlist`; | ||
do | ||
echo "run_pipe.pl configfile -l '\$s'.list -m '\$s'.N.list -d result_'\$(basename \$s)'"; | ||
done > fgenesh_parallel_command.sh && | ||
cat fgenesh_parallel_command.sh | parallel --will-cite -j "\${GALAXY_SLOTS:-10}" && | ||
###run_pipe.pl configfile -l seqlist -m seqlistN -d result && | ||
mv result_*/* result/ && | ||
run_fgenesh_2_gff3.pl result output_gff -sort -print_exons && 2>&1 | ||
#end if | ||
]]></command> | ||
<configfiles> | ||
<configfile name="cfg"><![CDATA[ | ||
GENE_PARAM = ${matrix_type.species_matrix.fields.path} | ||
PIPE_PARAM = ${db_type.genome_type.fields.path} | ||
PREDICT_GC = ${predict_gc} | ||
#if $map_mrna.mRNAs == '0' | ||
MAP_mRNAs = ${map_mrna.mRNAs} | ||
#else | ||
MAP_mRNAs = ${map_mrna.mRNAs} | ||
CDNA_FILE = ${map_mrna.cdna_file} | ||
PROT_FILE = ${map_mrna.prot_file} | ||
DAT_FILE = ${map_mrna.dat_file} | ||
#end if | ||
#if $map_est.ESTs == '0' | ||
MAP_ESTS = ${map_est.ESTs} | ||
#else | ||
MAP_ESTS = ${map_est.ESTs} | ||
EST_FILE = ${map_est.est_file} | ||
#end if | ||
USE_READS = ${use_reads} | ||
DIR_SITES = na | ||
PROG_PROT = ${use_proteins} | ||
USE_PROTEINS = ${use_proteins} | ||
PROTEIN_DB = ${nr_type.nr_db.fields.path} | ||
PROTEIN_DB_INDEX = ${nr_type.nr_db.fields.path}.ind | ||
PROTEIN_DB_TAG = NR | ||
BLAST_AI_PROTEINS = ${find_homologs} # find homologs for ab initio predicted genes ( 0 - no , 1 - yes) | ||
INTRONIC_GENES = ${intronic_genes} | ||
]]></configfile> | ||
</configfiles> | ||
<inputs> | ||
<conditional name="inputs"> | ||
<param name="input_type" type="select" label="Input type" help="Select single sequence or collection of sequence"> | ||
<option value="single" selected="true">Single sequence</option> | ||
<option value="multiple">Multiple sequences</option> | ||
</param> | ||
<when value="single"> | ||
<param name="single_seq" format="fasta" type="data" label="Single sequence" help="Single sequence" multiple="true"/> | ||
</when> | ||
<when value="multiple"> | ||
<param name="multiple_seq" format="fasta" type="data_collection" collection_type="list" label="Multiple sequence"/> | ||
</when> | ||
</conditional> | ||
<conditional name="repeat_sequence"> | ||
<param name="selector" type="select" label="Use repeat masking sequence" help="Enable this option if you want to use repeat masked sequences ."> | ||
<option value="no_repeat_seq" selected="true">No repeat sequence</option> | ||
<option value="single_masked_seq">Single masked sequence</option> | ||
<option value="multiple_masked_seq">Multiple masked sequences</option> | ||
</param> | ||
<when value="single_masked_seq"> | ||
<param name="masked_seq_single" format="fasta" type="data" label="repeat masked sequence" help="Single masked sequence" multiple="true"/> | ||
</when> | ||
<when value="multiple_masked_seq"> | ||
<param name="masked_seq_multiple" format="fasta" type="data_collection" collection_type="list" label="repeat masked sequence" help="Multiple repeat sequence"/> | ||
</when> | ||
<when value="no_repeat_seq"></when> | ||
</conditional> | ||
<conditional name="matrix_type"> | ||
<param name="matrix_type_selector" type="select" label="Select matrix type" help="Select matrix for your species"> | ||
<option value="indexed" selected="true">Use a built-in index</option> | ||
<option value="history">Use one from the history</option> | ||
</param> | ||
<when value="indexed"> | ||
<param name="species_matrix" type="select" label="Select a species matrix" help="If your species of interest is not listed, contact your Galaxy admin"> | ||
<options from_data_table="fgenesh_matrix"> | ||
<filter type="sort_by" column="2"/> | ||
<validator type="no_options" message="No indexes are available for the selected input dataset"/> | ||
</options> | ||
</param> | ||
</when> | ||
<when value="history"> | ||
<param name="own_file" type="data" format="txt" label="Select species matrix" /> | ||
</when> | ||
</conditional> | ||
<conditional name="db_type"> | ||
<param name="db_type_selector" type="select" label="Select db type" help="Select Mammal DB / Non Mammal DB"> | ||
<option value="indexed" selected="true">Use a built-in index</option> | ||
<option value="history">Use one from the history</option> | ||
</param> | ||
<when value="indexed"> | ||
<param name="genome_type" type="select" label="Select a reference database" help="If your database of interest is not listed, contact your Galaxy admin"> | ||
<options from_data_table="fgenesh_db"> | ||
<filter type="sort_by" column="2"/> | ||
<validator type="no_options" message="No indexes are available for the selected input dataset"/> | ||
</options> | ||
</param> | ||
</when> | ||
<when value="history"> | ||
<param name="own_file" type="data" format="txt" label="Select reference database" /> | ||
</when> | ||
</conditional> | ||
<conditional name="nr_type"> | ||
<param name="nr_type_selector" type="select" label="Select nr db type" help="Select NR database"> | ||
<option value="indexed" selected="true">Use a built-in index</option> | ||
<option value="history">Use one from the history</option> | ||
</param> | ||
<when value="indexed"> | ||
<param name="nr_db" type="select" label="Select a NR database" help="If your database of interest is not listed, contact your Galaxy admin"> | ||
<options from_data_table="fgenesh_nr"> | ||
<filter type="sort_by" column="2"/> | ||
<validator type="no_options" message="No indexes are available for the selected input dataset"/> | ||
</options> | ||
</param> | ||
</when> | ||
<when value="history"> | ||
<param name="own_file" type="data" format="txt" label="Select reference database" /> | ||
</when> | ||
</conditional> | ||
<conditional name="map_mrna"> | ||
<param name="mRNAs" type="select" label="mRNAs" help="map known mRNA data to the genomic sequences"> | ||
<option value="0">No</option> | ||
<option value="1">Yes</option> | ||
</param> | ||
<when value="1"> | ||
<param name="prot_file" type="data" format="fasta" label="cDNA file" help="cdna fasta file for known mRNAs"/> | ||
<param name="cdna_file" type="data" format="fasta" label="Protein file" help="protein fasta file for known mRNAs"/> | ||
<param name="dat_file" type="data" format="txt" label="Dat file" help="dat file for known mRNAs"/> | ||
</when> | ||
<when value="0"/> | ||
</conditional> | ||
<conditional name="map_est"> | ||
<param name="ESTs" type="select" label="ESTs" help="map ESTs to the genomic sequences"> | ||
<option value="0">No</option> | ||
<option value="1">Yes</option> | ||
</param> | ||
<when value="1"> | ||
<param name="est_file" type="data" format="fasta" label="ESTs file" help="fasta file with ESTs"/> | ||
</when> | ||
<when value="0"/> | ||
</conditional> | ||
<param name="predict_gc" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Predict GC" help="predict genes with GC donor splice sites or not"/> | ||
<param name="use_reads" type="boolean" checked="false" truevalue="1" falsevalue="0" label="USE_READS" help="use reads info to improve gene models"/> | ||
<param name="find_homologs" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Find homologs" help="find homologs for ab initio predicted genes"/> | ||
<param name="use_proteins" type="boolean" checked="false" truevalue="1" falsevalue="0" label="USE_PROTEINS" help="Using known proteins for prediction"/> | ||
<param name="intronic_genes" type="boolean" checked="false" truevalue="1" falsevalue="0" label="INTRONIC_GENES" help="predict genes in long introns of other genes"/> | ||
</inputs> | ||
<outputs> | ||
<!--<data name="single_annotation" format="txt" label="${tool.name} on ${on_string}: single annotation" from_work_dir="result/*.resn3"> | ||
<filter>input['input_type'] == 'individual'</filter> | ||
</data>--> | ||
<collection name="multiple_annotation" type="list" label="${tool.name} on ${on_string}: multiple annotation"> | ||
<discover_datasets pattern="(?P<name>.*).resn3$" format="txt" directory="result"/> | ||
<!--<filter>input['input_type'] == 'multiple'</filter>--> | ||
</collection> | ||
<collection name="annotated_gff3" type="list" label="${tool.name} on ${on_string}: GFF3"> | ||
<discover_datasets pattern="(?P<name>.*).gff3$" format="gff" directory="output_gff"/> | ||
</collection> | ||
</outputs> | ||
|
||
<tests> | ||
<test> | ||
<!-- #1 test --> | ||
</test> | ||
</tests> | ||
<help><![CDATA[ | ||
.. class:: infomark | ||
**What it does** | ||
*Fgenesh is a genome annotation tool* | ||
**Input** | ||
- input file - Genome or de novo assembly file in FASTA format and repeat masking fasta file | ||
**Command line Example:** | ||
- FGENESHPIPE/run_pipe.pl human_prj.cfg -l seq_1.list -m seq_1N.list -d results_1 | ||
- human_prj.cfg - a configuration file containing path to database, path gene matrix and the settings of the third party softwares | ||
- seq_1.list - a list of chromosome / scaffolds (unmasked) | ||
- seq_1N.ist - a list of chromosome / scaffolds (masked) | ||
- results_1 - output folder | ||
**Parameters:** | ||
- matrix type - built-in index or select the index from the user history | ||
- species matrix - select the gene matrix that matches the species of your input genome if built-in index in the matrix type is selected | ||
- db type - built-in database or select the database from the user history | ||
- reference database - fgenesh comes with Mammal DB / Non Mammal DB if built-in database in the db type is selected | ||
- NR db type - built-in non-redundant database or select the non-redundant database from the user history | ||
- NR database - select the non-redundant database for your species | ||
- mRNAs - map known mRNA sequences to the genomic sequences (default: No). If Yes is selected, make sure .cdna, .pro and .dat files are available in the user history. | ||
- ESTs - map ESTs to the genomic sequences (default: No) if Yes is selected, make sure there is ESTs fasta file available in the user history | ||
- Predict GC - predict genes with GC donor splice sites or not (default:No) | ||
- USE_READS - use reads info to improve gene models (default:No) | ||
- Find homologs - find homologs for ab initio predicted genes (0 - no, 1 - yes) | ||
- USE_PROTEINS - Using known proteins for prediction (default: No) | ||
- INTRONIC_GENES - Predicting genes in long introns of other genes (default: No) | ||
**Output** | ||
- txt (resn3) - raw output produced by Fgenesh | ||
- gff3 - gff3 file format converted from the Fgenesh resn3 file | ||
.. class:: infomark | ||
**Contributor** | ||
Galaxy Australia wrapped the Fgenesh: the original software is available on this website `FGENESH <http://www.softberry.com/berry.phtml?topic=fgenesh_plus_plus&group=help&subgroup=pipelines>`_. Note: The software license needs to acquired prior to use. | ||
]]></help> | ||
</tool> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
<tool id="fgenesh_get_mrnas_gc" name="FGENESH get mRNA or GC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" > | ||
<description>rensn3 and genomic file</description> | ||
<macros> | ||
<import>macros.xml</import> | ||
</macros> | ||
<expand macro="requirements" /> | ||
<expand macro="stdio" /> | ||
<command><![CDATA[ | ||
ln -s '$resn3_file' '$resn3_file.element_identifier' && | ||
ln -s '$sequence_file' '$sequence_file.element_identifier' && | ||
get_mrnas_or_GC.pl '$resn3_file.element_identifier' '$sequence_file.element_identifier' $output_mrna_file $CDS $GC -fix_id $fix_id_type 2>&1 | tee -a '$log' | ||
]]></command> | ||
<inputs> | ||
<param name="resn3_file" format="txt" type="data" label="Input Resn3 file"/> | ||
<param name="sequence_file" format="fasta" type="data" label="Sequence file"/> | ||
<param argument="-CDS" type="boolean" truevalue="-CDS" falsevalue="" checked="true" label="CDS only" help="create CDS fasta file" /> | ||
<param argument="-GC" type="boolean" truevalue="-GC" falsevalue="" checked="false" label="GC report" help="report GC donor splice sites" /> | ||
<param name="fix_id_type" type="select" label="fix header id in the output fasta file" help="Default: sequence name"> | ||
<option value="seq_No" selected="True">Sequence No</option> | ||
<option value="seq_count">Sequence Count</option> | ||
<option value="seq_nmae">Sequence Name</option> | ||
</param> | ||
</inputs> | ||
<outputs> | ||
<data name="output_mrna_file" format="fasta" label="${tool.name} on ${on_string}: mRNA file"/> | ||
<data name="log" format="txt" label="${tool.name} on ${on_string}: Report"/> | ||
</outputs> | ||
|
||
<tests> | ||
<test> | ||
<!-- #1 test --> | ||
</test> | ||
</tests> | ||
<help><![CDATA[ | ||
.. class:: infomark | ||
**What it does** | ||
*Fgenesh get_mrnas_or_GC.pl is a tool to extract CDS(mRNAs) sequences using the input file with Fgenesh/Fgenesh++ predictions* | ||
**Input** | ||
- resn3 file - input file with Fgenesh/Fgenesh++ predictions | ||
- sequence file - input file with genomic FASTA sequences | ||
**Command line Example:** | ||
- get_mrnas_or_GC.pl <resn3_file> <seq_file> <mrna_file> [-CDS] [-GC] [-fix_id seq_name | seq_No | seq_count] | ||
- mrna_file - output file with CDS sequences in fasta file format | ||
- CDS - CDS only | ||
- GC - rerport GC donor splice sites | ||
- fix_id | ||
- 1) seq_name - use sequence names [example of ID: 'ENm002_gene_7'] | ||
- 2) seq_No - (numbers are taken from 'Sequence: <No>' if such field is present, e.g., "Length of sequence: 1000000, Sequence: 2, File: encode_hg17_44N.fa") - [example of ID: 'seq_2_gene_7'] | ||
- 3) seq_count - use numbers (count sequences starting from 1) [example of ID: 'seq_2_gene_7'] | ||
**Output** | ||
- mrna_file - output file with CDS sequences in fasta file format | ||
- report - a report file of CDS and GC | ||
.. class:: infomark | ||
**Contributor** | ||
Galaxy Australia wrapped the Fgenesh: the original software is available on this website `FGENESH <http://www.softberry.com/berry.phtml?topic=fgenesh_plus_plus&group=help&subgroup=pipelines>`_. Note: The software license needs to acquired prior to use. | ||
]]></help> | ||
</tool> |
Oops, something went wrong.