Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add panaroo tool #118

Merged
merged 1 commit into from
Jul 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions tools/panaroo/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Galaxy Tool Shed metadata for the panaroo wrapper.
name: panaroo
owner: galaxy-australia
# Tool Shed categories this repository is filed under.
categories:
  - Pangenome
description: A Bacterial Pangenome Analysis Pipeline
homepage_url: https://gthlab.au/panaroo/#/
# Literal block scalar: the text below is kept as a single line.
long_description: |
  a graph-based pangenome clustering tool that is able to account for many of the sources of error introduced during the annotation of prokaryotic genome assemblies.
remote_repository_url: https://github.com/usegalaxy-au/tools-au/tree/master/tools/panaroo
type: unrestricted
69 changes: 69 additions & 0 deletions tools/panaroo/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
<macros>
    <!-- Version tokens substituted into the tool id/version and the requirements below. -->
    <token name="@TOOL_VERSION@">1.5.0</token>
    <token name="@VERSION_SUFFIX@">0</token>
    <token name="@PROFILE@">22.05</token>

    <!-- EDAM topic annotation. -->
    <xml name="edam_ontology">
        <edam_topics>
            <edam_topic>topic_0194</edam_topic>
        </edam_topics>
    </xml>

    <!-- Cross-reference to the bio.tools registry entry. -->
    <xml name="biotools">
        <xrefs>
            <xref type="bio.tools">panaroo</xref>
        </xrefs>
    </xml>

    <!-- Packages the tool needs at run time; panaroo is pinned to @TOOL_VERSION@. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@TOOL_VERSION@">panaroo</requirement>
            <requirement type="package" version="170427">prank</requirement>
        </requirements>
    </xml>

    <!-- Choices for the clean-mode select; none is marked selected. -->
    <xml name="clean_mode">
        <option value="strict">strict</option>
        <option value="moderate">moderate</option>
        <option value="sensitive">sensitive</option>
    </xml>

    <!-- Codon (translation) tables offered to the user; table 11 is preselected. -->
    <xml name="genetic_code">
        <option value="1">1. Standard</option>
        <option value="2">2. Vertebrate Mitochondrial</option>
        <option value="3">3. Yeast Mitochondrial</option>
        <option value="4">4. Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
        <option value="5">5. Invertebrate Mitochondrial</option>
        <option value="6">6. Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
        <option value="9">9. Echinoderm Mitochondrial</option>
        <option value="10">10. Euplotid Nuclear</option>
        <option value="11" selected="True">11. Bacteria and Archaea</option>
        <option value="12">12. Alternative Yeast Nuclear</option>
        <option value="13">13. Ascidian Mitochondrial</option>
        <option value="14">14. Flatworm Mitochondrial</option>
        <option value="15">15. Blepharisma Macronuclear</option>
        <option value="16">16. Chlorophycean Mitochondrial</option>
        <option value="21">21. Trematode Mitochondrial</option>
        <option value="22">22. Scenedesmus obliquus mitochondrial</option>
        <option value="23">23. Thraustochytrium Mitochondrial</option>
        <option value="24">24. Pterobranchia mitochondrial</option>
        <option value="25">25. Candidate Division SR1 and Gracilibacteria Code</option>
        <option value="26">26. Pachysolen tannophilus Nuclear Code</option>
        <option value="27">27. Karyorelict Nuclear Code</option>
        <option value="28">28. Condylostoma Nuclear Code</option>
        <option value="29">29. Mesodinium Nuclear Code</option>
        <option value="30">30. Peritrich Nuclear Code</option>
        <option value="31">31. Blastocrithidia Nuclear Code</option>
        <option value="33">33. Cephalodiscidae Mitochondrial UAA-Tyr Code</option>
    </xml>

    <!-- Choices for the gene re-finding stringency select. -->
    <xml name="refind_mode_option">
        <option value="default" selected="True">default</option>
        <option value="strict">strict</option>
        <option value="off">off</option>
    </xml>

    <!-- Which gene set to align; the "None" value is the preselected choice. -->
    <xml name="gene_alignment">
        <option value="None" selected="True">None</option>
        <option value="core">core</option>
        <option value="pan">pan</option>
    </xml>

    <!-- Aligner programs selectable for gene alignment. -->
    <xml name="gene_aligner">
        <option value="mafft" selected="True">mafft</option>
        <option value="prank">prank</option>
        <option value="clustal">clustal</option>
    </xml>
</macros>
235 changes: 235 additions & 0 deletions tools/panaroo/panaroo.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
<tool id="panaroo" name="Panaroo" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
<!-- Short description of the tool. -->
<description>A Bacterial Pangenome Analysis Pipeline</description>
<!-- Import macros.xml so its tokens (@TOOL_VERSION@, @VERSION_SUFFIX@, @PROFILE@) and named macros are available. -->
<macros>
<import>macros.xml</import>
</macros>
<!-- Expand the edam_ontology, biotools and requirements macros from the imported file. -->
<expand macro="edam_ontology"/>
<expand macro="biotools"/>
<expand macro="requirements"/>
<stdio>
    <!-- Any exit status of 1 or above marks the job as failed. -->
    <exit_code range="1:" level="fatal" />
    <!-- Also fail when either stream contains text matching System..*Exception. -->
    <regex level="fatal" source="both" match="System..*Exception" description="Error encountered" />
</stdio>
<command><![CDATA[
## Stage the inputs: panaroo is handed a shell glob over a directory of
## symlinks, one link per element of the input collection.
#import re
#set input_directory = 'input_directory'

mkdir outdir &&
mkdir '$input_directory' &&
#for $gff in $gff_input_collection:
    ## Sanitise the element identifier so it is safe to use as a file name.
    #set identifier = re.sub('[^\s\w\-\\.]','_',str($gff.element_identifier))
    ## The -i glob below only matches *.gff, so make sure every link carries
    ## that suffix; identifiers that already end in .gff are left untouched.
    #if not $identifier.endswith('.gff'):
        #set identifier = $identifier + '.gff'
    #end if
    ln -fs '$gff' '$input_directory/$identifier' &&
#end for

panaroo
    -t \${GALAXY_SLOTS:-2}
    #if str($gen_code) != 'None':
        --codon-table '$gen_code'
    #end if
    #if str($advanced.adv_options_selector) == "set":
        ## Boolean params render as their flag when checked and as an empty string otherwise.
        $advanced.remove_invalid_gene

        ## Matching
        -c '$advanced.matching_option.seq_threshold'
        -f '$advanced.matching_option.peptide_threshold'
        --len_dif_percent '$advanced.matching_option.length_diff_cutoff'
        $advanced.matching_option.merge_paralogs

        ## Refind
        --search_radius '$advanced.refind_option.search_radius'
        --refind_prop_match '$advanced.refind_option.refind_prop_match'
        --refind-mode '$advanced.refind_option.refind_mode'

        ## Graph correction
        --min_trailing_support '$advanced.graph_correction_option.min_trailing_support'
        --trailing_recursive '$advanced.graph_correction_option.trailing_recursive'
        --edge_support_threshold '$advanced.graph_correction_option.edge_support_threshold'
        --remove_by_consensus '$advanced.graph_correction_option.remove_by_consensus'
        --high_var_flag '$advanced.graph_correction_option.high_var_flag'
        --min_edge_support_sv '$advanced.graph_correction_option.min_edge_support_sv'
        $advanced.graph_correction_option.all_seq_in_graph
        $advanced.graph_correction_option.no_clean_edges

        ## Gene alignment
        #if str($advanced.gene_alignment_option.a) != 'None':
            -a '$advanced.gene_alignment_option.a'
        #end if
        ## The selected aligner is always passed explicitly (mafft is the form default).
        --aligner '$advanced.gene_alignment_option.aligner'
        ## The codons boolean renders as the strings true/false, so emit the flag by hand.
        #if str($advanced.gene_alignment_option.codons) == 'true':
            --codons
        #end if
        --core_threshold '$advanced.gene_alignment_option.core_threshold'
        #if str($advanced.gene_alignment_option.core_subset) != '':
            --core_subset '$advanced.gene_alignment_option.core_subset'
        #end if
        ## NOTE(review): len_outlier_proportion and core_entropy are declared in
        ## the inputs but are still not passed here. Wiring them would send their
        ## form values on every run with advanced options set, so confirm those
        ## values match panaroo's own defaults before adding them.
    #end if
    -i '$input_directory'/*.gff
    -o outdir
    --clean-mode '$mode'
    ## Capture stdout and stderr of panaroo itself in the log dataset.
    > '$log' 2>&1 &&

## Rename so the .Rtab and .csv presence/absence tables do not share a base name.
mv outdir/gene_presence_absence.Rtab outdir/gene_presence_absence_rtab.Rtab

]]></command>
<inputs>
    <!-- Required inputs: the annotated genomes, the cleaning stringency and the codon table. -->
    <param name="gff_input_collection" type="data_collection" format="gff" collection_type="list" label="GFF Input Collection" help="A list of GFF files (e.g. produced by Prokka)"/>
    <param name="mode" type="select" label="The stringency mode at which to run panaroo" help="--clean-mode">
        <expand macro="clean_mode"/>
    </param>
    <param name="gen_code" type="select" label="The codon table used for translation" help="default: 11">
        <expand macro="genetic_code"/>
    </param>

    <!-- Everything below is only shown, and only passed to panaroo, when the selector is "set". -->
    <conditional name="advanced">
        <param name="adv_options_selector" type="select" label="Set advanced options?" help="Provides additional controls">
            <option value="set">Set</option>
            <option value="do_not_set" selected="True">Do not set</option>
        </param>
        <when value="set">
            <param name="remove_invalid_gene" argument="--remove-invalid-genes" type="boolean" truevalue="--remove-invalid-genes" falsevalue="" label="removes annotations that do not conform to the expected Prokka format such as those including premature stop codons" help="--remove-invalid-genes"/>

            <section name="matching_option" title="Matching" expanded="false">
                <param name="seq_threshold" argument="--threshold" type="float" value="0.98" label="sequence identity threshold" help="default: 0.98"/>
                <param name="peptide_threshold" argument="--family_threshold" type="float" value="0.7" label="protein family sequence identity threshold" help="default: 0.7"/>
                <param name="length_diff_cutoff" argument="--len_dif_percent" type="float" value="0.98" label="length difference cutoff" help="default: 0.98"/>
                <param name="merge_paralogs" type="boolean" truevalue="--merge_paralogs" falsevalue="" checked="false" label="do not split paralogs" help="--merge_paralogs"/>
            </section>

            <section name="refind_option" title="Refind" expanded="false">
                <param argument="--search_radius" type="integer" value="5000" label="Search radius" help="--search_radius (default: 5000)"/>
                <param argument="--refind_prop_match" type="float" value="0.75" label="Gene proportion match" help="default: 0.75"/>
                <!-- The name is given explicitly so the command template keeps resolving refind_mode
                     while the advertised flag matches the hyphenated one the command passes. -->
                <param name="refind_mode" argument="--refind-mode" type="select" label="The stringency mode at which to re-find genes" help="default: default">
                    <expand macro="refind_mode_option"/>
                </param>
            </section>

            <section name="graph_correction_option" title="Graph Correction" expanded="false">
                <param argument="--min_trailing_support" type="integer" value="2" label="Minimum cluster size to keep a gene called at the end of a contig" help="--min_trailing_support [relaxed mode: 2 is used]"/>
                <param argument="--trailing_recursive" type="integer" value="1" label="Number of times to perform recursive trimming of low support nodes near the end of contigs" help="--trailing_recursive [relaxed mode: 1 is used]"/>
                <param name="edge_support_threshold" type="integer" value="1" label="Edge support threshold" help="--edge_support_threshold [ Minimal edge 1 is used ]"/>
                <param name="len_outlier_proportion" type="float" value="0.01" label="Length outlier support proportion" help="--length_outlier_support_proportion [default: 0.01]"/>
                <param name="remove_by_consensus" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Remove consensus" help="--remove_by_consensus [default: False]"/>
                <param name="high_var_flag" type="integer" value="5" label="Highly variable gene region" help="--high_var_flag [default: 5]"/>
                <param name="min_edge_support_sv" type="integer" value="2" label="Minimum edge support structural variants" help="--min_edge_support_sv [relaxed mode: 2 is used]"/>
                <param argument="--all_seq_in_graph" type="boolean" truevalue="--all_seq_in_graph" falsevalue="" label="Retains all DNA sequence" help="--all_seq_in_graph [default: off]"/>
                <param argument="--no_clean_edges" type="boolean" truevalue="--no_clean_edges" falsevalue="" label="Edge filtering in the final output graph" help="--no_clean_edges [default: off]"/>
            </section>

            <section name="gene_alignment_option" title="Gene Alignment" expanded="false">
                <!-- The output filters key on this parameter's value. -->
                <param argument="-a" type="select" label="Output alignments of core genes or all genes." help="-a [optional: core or pan; default: None]">
                    <expand macro="gene_alignment"/>
                </param>
                <param argument="--aligner" type="select" label="Specify an aligner" help="--aligner [mafft|prank|clustal][default: mafft]">
                    <expand macro="gene_aligner"/>
                </param>
                <param name="codons" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate codon alignments" help="--codons"/>
                <param name="core_threshold" type="float" value="0.95" label="Core-genome sample threshold" help="--core_threshold [default: 0.95]"/>
                <param argument="--core_subset" type="integer" value="" optional="true" label="Subset of the core genome to these many genes" help="--core_subset [default: all]"/>
                <param name="core_entropy" type="float" value="0.1" label="Set the Block Mapping and Gathering with Entropy" help="--core_entropy_filter (threshold can be between 0.0 and 1.0) [default: Tukey outlier method]"/>
            </section>
        </when>
        <when value="do_not_set"/>
    </conditional>
</inputs>
<outputs>
    <!--
        Exactly one of "output" / "output_pangenome" is meant to be produced per run.
        The filters test the conditional selector first because the gene_alignment_option
        section only exists in the "set" branch; indexing it unconditionally fails
        when advanced options are left at "do_not_set".
        The file suffix is matched literally rather than captured as an ext group, so
        each dataset takes the datatype declared in its format attribute.
    -->

    <!-- Pangenome files when no gene alignment was requested. -->
    <collection name="output" type="list" label="${tool.name} on ${on_string}: Pangenome output">
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.clstr" directory="outdir" format="txt" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.txt" directory="outdir" format="txt" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.gml" directory="outdir" format="txt" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.Rtab" directory="outdir" format="tabular" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.csv" directory="outdir" format="csv" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fasta" directory="outdir" format="fasta" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fa" directory="outdir" format="fasta" visible="false" />
        <filter>advanced['adv_options_selector'] != 'set' or advanced['gene_alignment_option']['a'] == 'None'</filter>
    </collection>

    <!-- Same files plus the alignment outputs (.aln, .embl) when -a is core or pan. -->
    <collection name="output_pangenome" type="list" label="${tool.name} on ${on_string}: Pangenome alignment output">
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.clstr" directory="outdir" format="txt" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.txt" directory="outdir" format="txt" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.gml" directory="outdir" format="txt" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.Rtab" directory="outdir" format="tabular" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.csv" directory="outdir" format="csv" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fasta" directory="outdir" format="fasta" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fa" directory="outdir" format="fasta" visible="false" />
        <!-- NOTE(review): confirm "aln" is a datatype known to the target Galaxy server. -->
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.aln" directory="outdir" format="aln" visible="false" />
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.embl" directory="outdir" format="embl" visible="false" />
        <filter>advanced['adv_options_selector'] == 'set' and advanced['gene_alignment_option']['a'] != 'None'</filter>
    </collection>

    <!-- Per-gene alignment files from outdir/aligned_gene_sequences. -->
    <collection name="output_pangenome_fasta" type="list" label="${tool.name} on ${on_string}: Pangenome alignment fasta">
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fas" directory="outdir/aligned_gene_sequences" format="fasta" visible="false" />
        <filter>advanced['adv_options_selector'] == 'set' and advanced['gene_alignment_option']['a'] != 'None'</filter>
    </collection>

    <!-- The command redirects panaroo's output into this dataset. -->
    <data name="log" format="txt" label="${tool.name} on ${on_string}: log"/>
</outputs>
<tests>
    <!-- Test 1: advanced options enabled at their form defaults, no gene alignment.
         Expects the plain pangenome collection plus the log. -->
    <test expect_num_outputs="2">
        <param name="gff_input_collection">
            <collection type="list">
                <element name="gff10.gff" value="10_small.gff"/>
                <element name="gff11.gff" value="11_small.gff"/>
            </collection>
        </param>
        <param name="mode" value="strict"/>
        <param name="gen_code" value="11"/>
        <conditional name="advanced">
            <param name="adv_options_selector" value="set"/>
            <section name="gene_alignment_option">
                <param name="a" value="None"/>
            </section>
        </conditional>
        <output_collection name="output" count="13"/>
        <output name="log">
            <assert_contents>
                <has_text text="pre-processing gff3 files..."/>
            </assert_contents>
        </output>
    </test>
    <!-- Test 2: same inputs with core gene alignment requested.
         Expects both alignment collections plus the log. -->
    <test expect_num_outputs="3">
        <param name="gff_input_collection">
            <collection type="list">
                <element name="gff10.gff" value="10_small.gff"/>
                <element name="gff11.gff" value="11_small.gff"/>
            </collection>
        </param>
        <param name="mode" value="strict"/>
        <param name="gen_code" value="11"/>
        <conditional name="advanced">
            <param name="adv_options_selector" value="set"/>
            <section name="gene_alignment_option">
                <param name="a" value="core"/>
            </section>
        </conditional>
        <output_collection name="output_pangenome" count="18"/>
        <output_collection name="output_pangenome_fasta" count="251"/>
        <output name="log">
            <assert_contents>
                <has_text text="pre-processing gff3 files..."/>
            </assert_contents>
        </output>
    </test>
</tests>
<help><![CDATA[
Panaroo_ is a bacterial pangenome analysis pipeline.

**INPUTS**

Panaroo now supports multiple input formats. To use non-standard GFF3 files you must provide the input files as a list in a text file (one per line). Separate GFF and FASTA files can be provided per isolate by providing each file delimited by a space or a tab. Genbank file formats are also supported with extensions '.gbk', '.gb' or '.gbff'. These must be compliant with Genbank/ENA/DDBJ. This can be forced in Prokka by specifying the ``--compliance`` parameter.

- data file in gff format

**OUTPUTS**

- combined_protein_cdhit_out.txt
- combined_protein_cdhit_out.txt.clstr
- pre_filt_graph.gml
- gene_data.csv
- combined_protein_CDS.fasta
- combined_DNA_CDS.fasta
- gene_presence_absence.Rtab (renamed to gene_presence_absence_rtab.Rtab by this tool so it does not clash with gene_presence_absence.csv)
- gene_presence_absence_roary.csv
- gene_presence_absence.csv
- summary_statistics.txt
- pan_genome_reference.fa
- struct_presence_absence.Rtab
- final_graph.gml


.. _Panaroo: https://gthlab.au/panaroo/#/gettingstarted/quickstart

]]></help>
<!-- Reference to cite when using this tool, given as a DOI. -->
<citations>
<citation type="doi">10.1186/s13059-020-02090-4</citation>
</citations>
</tool>

Loading
Loading