forked from geraldinepascal/FROGS-wrappers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering.xml
178 lines (126 loc) · 7.48 KB
/
clustering.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
<?xml version="1.0"?>
<!--
# Copyright (C) 2015 INRA
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-->
<tool id="FROGS_clustering" name="FROGS Clustering swarm" version="3.1">
<description>Single-linkage clustering on sequences</description>
<!-- Runtime dependency: the frogs package, pinned to version 3.1.0. -->
<requirements>
    <requirement type="package" version="3.1.0">frogs</requirement>
</requirements>
<!-- Error detection: every non-zero exit code, negative or positive, is treated as a fatal error. -->
<stdio>
    <exit_code range=":-1" level="fatal" />
    <exit_code range="1:" level="fatal" />
</stdio>
<!--
    Command line template: runs clustering.py on the sequences file and its count file.
    * GALAXY_SLOTS is written with an escaped dollar sign so that the template engine
      leaves it alone and the shell expands it when the job runs (1 CPU if it is unset).
    * Every dataset path is single-quoted so that a path containing spaces or shell
      metacharacters is still passed to the script as one single argument.
    * $denoising is deliberately left unquoted: it expands either to the denoising flag
      or to an empty string (see the boolean parameter in the inputs section), and a
      quoted empty string would reach the script as an empty positional argument.
    * The template is wrapped in CDATA so that shell characters never need XML escaping.
-->
<command><![CDATA[
    clustering.py
        --nb-cpus \${GALAXY_SLOTS:-1}
        --distance $maximal_distance
        --input-fasta '$sequence_file'
        --input-count '$count_file'
        --output-biom '$abundance_biom'
        --output-fasta '$seed_file'
        --output-compo '$swarms_composition'
        $denoising
]]></command>
<inputs>
    <!-- Files: the sequences to cluster and the per-sample count of each sequence. -->
    <param name="sequence_file" type="data" format="fasta" optional="false"
           label="Sequences file"
           help="The sequences file (format: fasta)." />
    <param name="count_file" type="data" format="tabular" optional="false"
           label="Count file"
           help="It contains the count by sample for each sequence (format: TSV)." />
    <!-- Parameters -->
    <!-- Aggregation distance given to the distance option of the command; allowed range 1 to 15, default 3. -->
    <param name="maximal_distance" type="integer" value="3" min="1" max="15" optional="false"
           label="Aggregation distance"
           help="Maximum number of differences between sequences in each aggregation step." />
    <!-- Checked by default. Expands to the denoising flag when checked and to an empty string otherwise. -->
    <param name="denoising" type="boolean" checked="true" truevalue="--denoising" falsevalue=""
           label="Perform denoising clustering step?"
           help="If checked, clustering will be performed in two steps, first with distance = 1 and then with your input distance." />
</inputs>
<!--
    Output datasets: cluster seed sequences, per-sample cluster abundances and cluster composition.
    NOTE(review): the command section already passes each of these datasets to clustering.py
    as an explicit output path; confirm whether the from_work_dir attributes are still needed.
-->
<outputs>
    <data name="seed_file" format="fasta"
          label="${tool.name}: seed_sequences.fasta"
          from_work_dir="seeds.fasta" />
    <data name="abundance_biom" format="biom1"
          label="${tool.name}: abundance.biom"
          from_work_dir="abundance.biom" />
    <data name="swarms_composition" format="tabular"
          label="${tool.name}: swarms_composition.tsv"
          from_work_dir="swarms.tsv" />
</outputs>
<!-- Functional test: one run with denoising enabled and an aggregation distance of 3. -->
<tests>
    <test>
        <param name="sequence_file" value="references/01-prepro-vsearch.fasta" />
        <param name="count_file" value="references/01-prepro-vsearch.tsv" />
        <param name="maximal_distance" value="3" />
        <param name="denoising" value="true" />
        <!-- The seed and composition outputs must have exactly the same size as the reference files. -->
        <output name="seed_file" file="references/02-clustering.fasta" compare="sim_size" delta="0" />
        <!-- The abundance BIOM output is currently not checked; the attempts below are kept for reference. -->
        <!-- output name="abundance_biom" file="references/02-clustering.biom" -->
        <!-- Here are two other ways to compare (but they do not work in this case because the files generated by planemo are really different from those of our manual tests; cf. /tmp/frogs-test-result-olivier /tmp/frogs-test-result-olivier2) -->
        <!-- cf. https://docs.galaxyproject.org/en/latest/dev/schema.html#tool-tests-test-output -->
        <!--output name="abundance_biom" file="references/02-clustering.biom" compare="sim_size" delta="10"/-->
        <!--output name="abundance_biom" file="references/02-clustering.biom" compare="diff" lines_diff="1"/-->
        <output name="swarms_composition" file="references/02-clustering_compo.tsv" compare="sim_size" delta="0" />
    </test>
</tests>
<help>
.. image:: static/images/frogs_images/FROGS_logo.png
:height: 144
:width: 110
.. class:: infomark page-header h2
What it does
Single-linkage clustering on sequences.
.. class:: infomark page-header h2
Inputs/Outputs
.. class:: h3
Inputs
**Sequences file**:
The sequence file with all samples sequences (format `FASTA <https://en.wikipedia.org/wiki/FASTA_format>`_). These sequences are dereplicated: strictly identical sequences are represented only once and the initial count is kept in the count file.
The sequence ID must be "sequenceID;size=X" with X equal to the total abundance among all samples.
*It corresponds to one output of FROGS Pre-process tools.*
**Count file**:
This file contains the count of all unique sequences in each sample (format `TSV <https://en.wikipedia.org/wiki/Tab-separated_values>`_).
Example::
#id splA splB
seq1 1289 2901
seq2 3415 0
.. class:: h3
Outputs
**Abundance file** (abundance.biom):
The abundance of each cluster in each sample (format `BIOM <http://biom-format.org/>`_). This format is widely used in metagenomic software.
**Clusters seeds** (seed_sequences.fasta):
The clusters representative sequences (format `FASTA <https://en.wikipedia.org/wiki/FASTA_format>`_).
**Clusters composition** (swarms_composition.tsv):
A text file representing the read composition of each cluster (format txt). Each line represents one cluster and is composed of read identifiers separated by spaces.
.. class:: infomark page-header h2
How it works
.. csv-table::
:header: "Steps", "With denoising", "Without denoising"
:widths: 5, 150, 150
:class: table table-striped
"1", "Sorting the reads by their abundance", "Sorting the reads by their abundance"
"2", "Clusters the reads (`Swarm <https://github.com/torognes/swarm>`_). The distance parameter is 1", "/"
"3", "Sorting the pre-clusters by their abundance", "/"
"4", "Clusters the pre-clusters (`Swarm <https://github.com/torognes/swarm>`_) with the distance you specify", "Clusters the reads (`Swarm <https://github.com/torognes/swarm>`_) with the distance you specify"
**Swarm focus**
Swarm uses an iterative growth process and sequence abundance values to delineate OTUs.
.. image:: static/images/frogs_images/FROGS_cluster_swarm.png
:height: 223
:width: 666
In each growth step, the sequence of the previous step is used to find the other sequences with a number of differences less than or equal to the "Aggregation distance".
After aggregation, Swarm refines the clusters by looking at the abundances along the connections. Theoretically, the abundances must decrease as you move away from the seed (which is often the most abundant sequence). If the abundance rises again, it means that two different clusters are connected by some poorly abundant sequences, so Swarm cuts the connection.
.. class:: infomark page-header h2
Advice
The denoising step builds very fine clusters with minimal differences. In this case, the number of differences between the sequences of each crown is equal to 1. This first clustering is extremely quick. After the denoising, a second swarm is run between the seeds from this first clustering, with the aggregation distance you have configured (>1).
To have some metrics on your clusters, you can use the tool **FROGS Clusters Stat**.
----
**Contact**
Contacts: frogs@inra.fr
Repository: https://github.com/geraldinepascal/FROGS
website: http://frogs.toulouse.inra.fr/
Please cite the **FROGS article**: *Escudie F., et al. Bioinformatics, 2018. FROGS: Find, Rapidly, OTUs with Galaxy Solution.*
</help>
<!-- Publication to cite for this tool, referenced by its DOI. -->
<citations>
    <citation type="doi">10.1093/bioinformatics/btx791</citation>
</citations>
</tool>