Merge pull request #102 from mthang/fgenesh_update

update and split fgenesh tools
usegalaxy-au · Jan 18, 2024 · e0633b7 · e0633b7
2 parents 8d44f0a + 5c8f294
commit e0633b7
Show file tree

Hide file tree

Showing 6 changed files with 661 additions and 2 deletions.
diff --git a/tools/fgenesh/fgenesh_annotate.xml b/tools/fgenesh/fgenesh_annotate.xml
@@ -0,0 +1,295 @@
+<tool id="fgenesh_annotate" name="FGENESH annotate" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" >
+    <description>sequences</description>
+    <macros>
+	  <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <command><![CDATA[
+        ## Working directories: result holds the merged pipeline output, output_gff the converted GFF3 files.
+        mkdir -p result &&
+        mkdir -p output_gff &&
+
+        ## Build the pipeline config file: BLAST location and thread count first,
+        ## followed by the rendered cfg configfile template.
+        BLAST_PATH=\$(which blastp) &&
+        echo "BLASTP = \$BLAST_PATH" > configfile &&
+        echo "BLAST2 = \$BLAST_PATH" >> configfile &&
+        echo "NUM_THREADS = \${GALAXY_SLOTS:-4}"  >> configfile &&
+        cat '$cfg' >> configfile &&
+
+        ## Data preparation: symlink every input under its element identifier, record its
+        ## absolute path in seqlist (seqlistN for masked sequences) and write a one-entry
+        ## .list file per sequence so each sequence can be run as a separate pipeline job.
+        ## NOTE(review): the run sections iterate the lists with a word-splitting for loop,
+        ## so element identifiers containing whitespace are still not supported.
+        #if $inputs.input_type == 'single':
+            #for $input in $inputs.single_seq
+                ln -fs '$input' '$input.element_identifier' &&
+                echo `pwd`/'$input.element_identifier' >> seqlist &&
+                echo `pwd`/'$input.element_identifier' > '$input.element_identifier'.list &&
+            #end for
+            sort seqlist > sorted_seqlist &&
+            #if $repeat_sequence.selector == 'single_masked_seq':
+                #for $seq in $repeat_sequence.masked_seq_single
+                    ln -fs '$seq' '$seq.element_identifier' &&
+                    echo `pwd`/'$seq.element_identifier' >> seqlistN &&
+                    echo `pwd`/'$seq.element_identifier' > '$seq.element_identifier'.list &&
+                #end for
+                sort seqlistN > sorted_seqlistN &&
+            #end if
+        #elif $inputs.input_type == 'multiple':
+            #for $i,$input in enumerate($inputs.multiple_seq)
+                ln -fs '$input' '$input.element_identifier' &&
+                echo `pwd`/'$input.element_identifier' >> seqlist &&
+                echo `pwd`/'$input.element_identifier' > '$input.element_identifier'.list &&
+            #end for
+            ## The no_repeat_seq and single_masked_seq run sections read sorted_seqlist,
+            ## so it has to exist for collection input as well.
+            sort seqlist > sorted_seqlist &&
+            #if $repeat_sequence.selector == 'multiple_masked_seq':
+                #for $e,$mseq in enumerate($repeat_sequence.masked_seq_multiple)
+                    ln -fs '$mseq' '$mseq.element_identifier' &&
+                    echo `pwd`/'$mseq.element_identifier' >> seqlistN_temp &&
+                    ## sort the file names in seqlistN to keep the same order as the file names in seqlist
+                    paste seqlistN_temp seqlist | sort | cut -f1 > seqlistN &&
+                    ## prepare the per-sequence list used by the parallel command
+                    echo `pwd`/'$mseq.element_identifier' > '$mseq.element_identifier'.list &&
+                #end for
+            #end if
+        #end if
+
+        ## Run section: write one run_pipe.pl call per sequence into fgenesh_parallel_command.sh,
+        ## execute the calls with GNU parallel, merge the per-sequence result_* directories into
+        ## result and convert the output to GFF3.
+        ## The masked variants pass -m with the list file named after the input plus the suffix .N.list.
+        ## NOTE(review): that list only exists when the masked dataset is named like the input followed by .N - confirm.
+        #if $repeat_sequence.selector == 'no_repeat_seq':
+
+            for s in `cat sorted_seqlist`;
+            do
+               echo  "run_pipe.pl configfile -l '\$s'.list -d result_'\$(basename \$s)'";
+            done > fgenesh_parallel_command.sh &&
+
+            cat fgenesh_parallel_command.sh | parallel --will-cite -j "\${GALAXY_SLOTS:-10}" &&
+
+            mv result_*/* result/ &&
+            run_fgenesh_2_gff3.pl result output_gff -sort -print_exons 2>&1
+        #elif $repeat_sequence.selector == 'single_masked_seq':
+
+            for s in `cat sorted_seqlist`;
+            do
+               echo  "run_pipe.pl configfile -l '\$s'.list -m '\$s'.N.list -d result_'\$(basename \$s)'";
+            done > fgenesh_parallel_command.sh &&
+
+            cat fgenesh_parallel_command.sh | parallel --will-cite -j "\${GALAXY_SLOTS:-10}" &&
+
+            mv result_*/* result/ &&
+            run_fgenesh_2_gff3.pl result output_gff -sort -print_exons 2>&1
+        #elif $repeat_sequence.selector == 'multiple_masked_seq':
+
+            for s in `cat seqlist`;
+            do
+                echo  "run_pipe.pl configfile -l '\$s'.list -m '\$s'.N.list -d result_'\$(basename \$s)'";
+            done > fgenesh_parallel_command.sh &&
+
+            cat fgenesh_parallel_command.sh | parallel --will-cite -j "\${GALAXY_SLOTS:-10}" &&
+
+            mv result_*/* result/ &&
+            run_fgenesh_2_gff3.pl result output_gff -sort -print_exons 2>&1
+        #end if
+
+        ]]></command>
+    <configfiles>
+        <!-- FGENESH++ pipeline settings. The command section appends this rendered file
+             to configfile after the BLAST path and thread entries. -->
+        <configfile name="cfg"><![CDATA[
+## Each of the three database conditionals offers a built-in index (data table entry)
+## or a history dataset (own_file); only the parameter of the selected branch exists.
+#if $matrix_type.matrix_type_selector == 'indexed'
+GENE_PARAM = ${matrix_type.species_matrix.fields.path}
+#else
+GENE_PARAM = ${matrix_type.own_file}
+#end if
+#if $db_type.db_type_selector == 'indexed'
+PIPE_PARAM = ${db_type.genome_type.fields.path}
+#else
+PIPE_PARAM = ${db_type.own_file}
+#end if
+PREDICT_GC = ${predict_gc}
+## The mRNA and EST input files are only written when the corresponding mapping is enabled.
+MAP_mRNAs = ${map_mrna.mRNAs}
+#if str($map_mrna.mRNAs) != '0'
+CDNA_FILE = ${map_mrna.cdna_file}
+PROT_FILE = ${map_mrna.prot_file}
+DAT_FILE = ${map_mrna.dat_file}
+#end if
+MAP_ESTS = ${map_est.ESTs}
+#if str($map_est.ESTs) != '0'
+EST_FILE = ${map_est.est_file}
+#end if
+USE_READS = ${use_reads}
+DIR_SITES = na
+PROG_PROT = ${use_proteins}
+USE_PROTEINS = ${use_proteins}
+#if $nr_type.nr_type_selector == 'indexed'
+PROTEIN_DB = ${nr_type.nr_db.fields.path}
+PROTEIN_DB_INDEX = ${nr_type.nr_db.fields.path}.ind
+#else
+## NOTE(review): a history dataset has no .ind file next to it unless one is created - confirm before relying on this branch.
+PROTEIN_DB = ${nr_type.own_file}
+PROTEIN_DB_INDEX = ${nr_type.own_file}.ind
+#end if
+PROTEIN_DB_TAG = NR
+BLAST_AI_PROTEINS =  ${find_homologs} # find homologs for ab initio predicted genes ( 0 - no , 1 - yes)
+INTRONIC_GENES = ${intronic_genes} 
+             ]]></configfile>
+    </configfiles>
+    <inputs>
+        <!-- Genomic sequences: several individual datasets or a list collection. -->
+        <conditional name="inputs">
+            <param name="input_type" type="select" label="Input type" help="Select single sequence or collection of sequence">
+                <option value="single" selected="true">Single sequence</option>
+                <option value="multiple">Multiple sequences</option>
+            </param>
+            <when value="single">
+                <param name="single_seq" format="fasta" type="data" label="Single sequence" help="Single sequence" multiple="true"/>
+            </when>
+            <when value="multiple">
+                <param name="multiple_seq" format="fasta" type="data_collection" collection_type="list" label="Multiple sequence"/>
+            </when>
+        </conditional>
+        <!-- Optional repeat-masked versions of the input sequences. -->
+        <conditional name="repeat_sequence">
+            <param name="selector" type="select" label="Use repeat masking sequence" help="Enable this option if you want to use repeat masked sequences .">
+                <option value="no_repeat_seq" selected="true">No repeat sequence</option>
+                <option value="single_masked_seq">Single masked sequence</option>
+                <option value="multiple_masked_seq">Multiple masked sequences</option>
+            </param>
+            <when value="single_masked_seq">
+                <param name="masked_seq_single" format="fasta" type="data" label="repeat masked sequence" help="Single masked sequence" multiple="true"/>
+            </when>
+            <when value="multiple_masked_seq">
+                <param name="masked_seq_multiple" format="fasta" type="data_collection" collection_type="list" label="repeat masked sequence" help="Multiple repeat sequence"/>
+            </when>
+            <when value="no_repeat_seq"></when>
+        </conditional>
+        <!-- Gene-finding matrix: entry of the fgenesh_matrix data table or a history dataset. -->
+        <conditional name="matrix_type">
+            <param name="matrix_type_selector" type="select" label="Select matrix type" help="Select matrix for your species">
+                <option value="indexed" selected="true">Use a built-in index</option>
+                <option value="history">Use one from the history</option>
+            </param>
+            <when value="indexed">
+                <param name="species_matrix" type="select" label="Select a species matrix" help="If your species of interest is not listed, contact your Galaxy admin">
+                    <options from_data_table="fgenesh_matrix">
+                        <filter type="sort_by" column="2"/>
+                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+                    </options>
+                </param>
+            </when>
+            <when value="history">
+                <param name="own_file" type="data" format="txt" label="Select species matrix" />
+            </when>
+        </conditional>
+        <!-- Pipeline parameter database: entry of the fgenesh_db data table or a history dataset. -->
+        <conditional name="db_type">
+            <param name="db_type_selector" type="select" label="Select db type" help="Select Mammal DB / Non Mammal DB">
+                <option value="indexed" selected="true">Use a built-in index</option>
+                <option value="history">Use one from the history</option>
+            </param>
+            <when value="indexed">
+                <param name="genome_type" type="select" label="Select a reference database" help="If your database of interest is not listed, contact your Galaxy admin">
+                    <options from_data_table="fgenesh_db">
+                        <filter type="sort_by" column="2"/>
+                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+                    </options>
+                </param>
+            </when>
+            <when value="history">
+                <param name="own_file" type="data" format="txt" label="Select reference database" />
+            </when>
+        </conditional>
+        <!-- Non-redundant protein database: entry of the fgenesh_nr data table or a history dataset. -->
+        <conditional name="nr_type">
+            <param name="nr_type_selector" type="select" label="Select nr db type" help="Select NR database">
+                <option value="indexed" selected="true">Use a built-in index</option>
+                <option value="history">Use one from the history</option>
+            </param>
+            <when value="indexed">
+                <param name="nr_db" type="select" label="Select a NR database" help="If your database of interest is not listed, contact your Galaxy admin">
+                    <options from_data_table="fgenesh_nr">
+                        <filter type="sort_by" column="2"/>
+                        <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+                    </options>
+                </param>
+            </when>
+            <when value="history">
+                <param name="own_file" type="data" format="txt" label="Select NR database" />
+            </when>
+        </conditional>
+        <!-- Known mRNA mapping. The configfile writes cdna_file to CDNA_FILE and prot_file to
+             PROT_FILE, so each label has to describe the parameter it is attached to. -->
+        <conditional name="map_mrna">
+            <param name="mRNAs" type="select" label="mRNAs" help="map known mRNA data to the genomic sequences">
+                <option value="0">No</option>
+                <option value="1">Yes</option>
+            </param>
+            <when value="1">
+                <param name="cdna_file" type="data" format="fasta" label="cDNA file" help="cdna fasta file for known mRNAs"/>
+                <param name="prot_file" type="data" format="fasta" label="Protein file" help="protein fasta file for known mRNAs"/>
+                <param name="dat_file" type="data" format="txt" label="Dat file" help="dat file for known mRNAs"/>
+            </when>
+            <when value="0"/>
+        </conditional>
+        <!-- EST mapping. -->
+        <conditional name="map_est">
+            <param name="ESTs" type="select" label="ESTs" help="map ESTs to the genomic sequences">
+                <option value="0">No</option>
+                <option value="1">Yes</option>
+            </param>
+            <when value="1">
+                <param name="est_file" type="data" format="fasta" label="ESTs file" help="fasta file with ESTs"/>
+            </when>
+            <when value="0"/>
+        </conditional>
+        <!-- 0/1 switches written verbatim into the pipeline config file. -->
+        <param name="predict_gc" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Predict GC" help="predict genes with GC donor splice sites or not"/>
+        <param name="use_reads" type="boolean" checked="false" truevalue="1" falsevalue="0" label="USE_READS" help="use reads info to improve gene models"/>
+        <param name="find_homologs" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Find homologs" help="find homologs for ab initio predicted genes"/>
+        <param name="use_proteins" type="boolean" checked="false" truevalue="1" falsevalue="0" label="USE_PROTEINS" help="Using known proteins for prediction"/>
+        <param name="intronic_genes" type="boolean" checked="false" truevalue="1" falsevalue="0" label="INTRONIC_GENES" help="predict genes in long introns of other genes"/>
+    </inputs>
+    <outputs>
+        <!--<data name="single_annotation" format="txt" label="${tool.name} on ${on_string}: single annotation" from_work_dir="result/*.resn3">
+              <filter>input['input_type'] == 'individual'</filter>
+        </data>-->
+        <!-- Raw predictions: one element per *.resn3 file found in the result directory.
+             The dot before the extension is escaped so it only matches a literal dot. -->
+        <collection name="multiple_annotation" type="list" label="${tool.name} on ${on_string}: multiple annotation">
+            <discover_datasets pattern="(?P&lt;name&gt;.*)\.resn3$" format="txt" directory="result"/>
+            <!--<filter>input['input_type'] == 'multiple'</filter>-->
+        </collection>
+        <!-- Converted predictions: one element per *.gff3 file found in the output_gff directory. -->
+        <collection name="annotated_gff3" type="list" label="${tool.name} on ${on_string}: GFF3">
+            <discover_datasets pattern="(?P&lt;name&gt;.*)\.gff3$" format="gff3" directory="output_gff"/>
+        </collection>
+    </outputs>
+
+    <tests>
+	 <test>
+            <!-- #1 test: placeholder only; no inputs, parameters or output assertions are defined yet -->
+        </test>
+    </tests>
+    <help><![CDATA[
+	    
+.. class:: infomark
+
+**What it does**
+	    
+*Fgenesh is a genome annotation tool*
+
+**Input**
+	    
+- input file  -  Genome or de novo assembly file in FASTA format and repeat masking fasta file  
+
+**Command line Example:**
+
+- FGENESHPIPE/run_pipe.pl  human_prj.cfg  -l seq_1.list  -m seq_1N.list  -d results_1
+
+- human_prj.cfg - a configuration file containing the path to the database, the path to the gene matrix and the settings of the third-party software
+- seq_1.list - a list of chromosome / scaffolds (unmasked)
+- seq_1N.list - a list of chromosome / scaffolds (masked)
+- results_1 - output folder
+
+
+**Parameters:**
+
+- matrix type - built-in index or select the index from the user history
+- species matrix - select the gene matrix that matches the species of your input genome if built-in index in the matrix type is selected
+- db type - built-in database or select the database from the user history
+- reference database - fgenesh comes with Mammal DB / Non Mammal DB if built-in database in the db type is selected
+- NR db type - built-in non-redundant database or select the non-redundant database from the user history
+- NR database - select the non-redundant database for your species
+- mRNAs - map known mRNA sequences to the genomic sequences (default: No). If Yes is selected, make sure .cdna, .pro and .dat files are available in the user history.
+- ESTs - map ESTs to the genomic sequences (default: No) if Yes is selected, make sure there is ESTs fasta file available in the user history
+- Predict GC - predict genes with GC donor splice sites or not (default:No)
+- USE_READS - use reads info to improve gene models (default:No)
+- Find homologs - find homologs for ab initio predicted genes (0 - no, 1 - yes)
+- USE_PROTEINS - Using known proteins for prediction (default: No)
+- INTRONIC_GENES - Predicting genes in long introns of other genes (default: No)
+
+**Output**
+
+- txt (resn3) - raw output produced by Fgenesh
+- gff3 - gff3 file format converted from the Fgenesh resn3 file
+
+ 
+.. class:: infomark
+
+**Contributor**
+
+Galaxy Australia wrapped the Fgenesh: the original software is available on this website `FGENESH <http://www.softberry.com/berry.phtml?topic=fgenesh_plus_plus&group=help&subgroup=pipelines>`_. Note: The software license needs to be acquired prior to use.
+
+	    
+	]]></help>
+</tool>
diff --git a/tools/fgenesh/fgenesh_get_mrnas_gc.xml b/tools/fgenesh/fgenesh_get_mrnas_gc.xml
@@ -0,0 +1,80 @@
+<tool id="fgenesh_get_mrnas_gc" name="FGENESH get mRNA or GC" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" >
+    <description>resn3 and genomic file</description>
+    <macros>
+	  <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="stdio" />
+    <command><![CDATA[
+        ## The script output is piped through tee into the report dataset. Without pipefail the
+        ## exit status of the pipeline would be that of tee, hiding a failure of get_mrnas_or_GC.pl.
+        set -o pipefail &&
+        ## Expose both inputs under their element identifiers in the working directory.
+        ln -fs '$resn3_file' '$resn3_file.element_identifier' &&
+        ln -fs '$sequence_file' '$sequence_file.element_identifier' &&
+        get_mrnas_or_GC.pl '$resn3_file.element_identifier' '$sequence_file.element_identifier' '$output_mrna_file' $CDS $GC -fix_id $fix_id_type 2>&1 | tee -a '$log'
+
+        ]]></command>
+    <inputs>
+        <param name="resn3_file" format="txt" type="data" label="Input Resn3 file"/>
+        <param name="sequence_file" format="fasta" type="data" label="Sequence file"/>
+        <!-- Boolean flags are passed to the script verbatim when checked and omitted otherwise. -->
+        <param argument="-CDS" type="boolean" truevalue="-CDS" falsevalue="" checked="true" label="CDS only" help="create CDS fasta file" />
+        <param argument="-GC" type="boolean" truevalue="-GC" falsevalue="" checked="false" label="GC report" help="report GC donor splice sites" />
+        <!-- The option values are passed after -fix_id and must be spelled exactly as the script
+             expects: seq_name, seq_No or seq_count.
+             NOTE(review): the help text names "sequence name" as the default while seq_No is the
+             pre-selected option; confirm which one is intended. -->
+        <param name="fix_id_type" type="select" label="fix header id in the output fasta file" help="Default: sequence name">
+            <option value="seq_No" selected="True">Sequence No</option>
+            <option value="seq_count">Sequence Count</option>
+            <option value="seq_name">Sequence Name</option>
+        </param>
+    </inputs>
+    <outputs>
+        <!-- FASTA file written by get_mrnas_or_GC.pl (third positional argument of the command). -->
+        <data name="output_mrna_file"
+              format="fasta"
+              label="${tool.name} on ${on_string}: mRNA file"/>
+        <!-- Combined stdout/stderr of the script, captured through tee in the command. -->
+        <data name="log"
+              format="txt"
+              label="${tool.name} on ${on_string}: Report"/>
+    </outputs>
+
+    <tests>
+	 <test>
+            <!-- #1 test: placeholder only; no inputs, parameters or output assertions are defined yet -->
+        </test>
+    </tests>
+    <help><![CDATA[
+	    
+.. class:: infomark
+
+**What it does**
+	    
+*Fgenesh get_mrnas_or_GC.pl is a tool to extract CDS(mRNAs) sequences using the input file with Fgenesh/Fgenesh++ predictions*
+
+**Input**
+	    
+- resn3 file  -  input file with Fgenesh/Fgenesh++ predictions  
+
+- sequence file - input  file with genomic FASTA sequences
+
+
+**Command line Example:**
+
+- get_mrnas_or_GC.pl <resn3_file> <seq_file> <mrna_file> [-CDS] [-GC] [-fix_id seq_name | seq_No | seq_count]
+
+- mrna_file - output file with CDS sequences in fasta file format
+
+- CDS - CDS only
+
+- GC - report GC donor splice sites
+
+- fix_id 
+- 1) seq_name - use sequence names [example of ID: 'ENm002_gene_7'] 
+- 2) seq_No - (numbers are taken from 'Sequence: <No>' if such field is present, e.g., "Length of sequence: 1000000, Sequence: 2, File: encode_hg17_44N.fa") - [example of ID: 'seq_2_gene_7']
+- 3) seq_count - use numbers (count sequences starting from 1) [example of ID: 'seq_2_gene_7']
+	
+**Output**
+
+- mrna_file - output file with CDS sequences in fasta file format
+
+- report - a report file of CDS and GC
+
+ 
+.. class:: infomark
+
+**Contributor**
+
+Galaxy Australia wrapped the Fgenesh: the original software is available on this website `FGENESH <http://www.softberry.com/berry.phtml?topic=fgenesh_plus_plus&group=help&subgroup=pipelines>`_. Note: The software license needs to be acquired prior to use.
+
+	    
+	]]></help>
+</tool>