Merge branch 'master' of https://github.com/usegalaxy-au/tools-au

usegalaxy-au · Jun 25, 2024 · c5aebed · c5aebed
2 parents 55772b6 + 8c6af15
commit c5aebed
Show file tree

Hide file tree

Showing 16 changed files with 635 additions and 8 deletions.
diff --git a/tools/cellranger/cellranger.xml b/tools/cellranger/cellranger.xml
@@ -8,6 +8,7 @@
     <command><![CDATA[
 
 	    #import re
+	    #import os
 
 	    #set tool_type = $tool_cond.tool
 	    #if $tool_type == "count":
@@ -21,7 +22,30 @@
 	    	#end for
 	    #elif str($tool_cond.tool) == "mkgtf":
 	        #set attributeTag ="--attribute"
-	        #set attribute=str(" ".join(["%s%s" % (str("--attribute=gene_biotype:"), str(ft)) for ft in $tool_cond.attributes]))
+	    	#set attribute=str(" ".join(["%s%s" % (str("--attribute=gene_biotype:"), str(ft)) for ft in $tool_cond.attributes]))
+	    #elif str($tool_cond.tool) == "multi":
+	   	mkdir output_directory &&
+	    	#set input_directory='input_data_directory'
+                touch tmp &&
+                #for $input in $tool_cond.inputs:
+                     #set sample_name_directory=$input.input_collection.element_identifier
+                     #set lib_type = $input.type
+	    	     #set sample_fastq_directory = '/'.join([str("/"),str($input_directory),str($sample_name_directory)])
+                     #if str($input.lanes.lane_source.lane_source_selector) == "user_define":
+                        #set num_lanes = "%s" % ("|".join($input.lanes.lane_source.lane))
+                        #set library_record=','.join([str($sample_name_directory),str($sample_fastq_directory),str($num_lanes),str($lib_type)])
+                        echo '$library_record' >> tmp &&
+                     #else:
+                        #set library_record=','.join([str($sample_name_directory),str($sample_fastq_directory),str(""),str($lib_type)])
+                        echo '$library_record' >> tmp &&
+                     #end if
+                     mkdir -p $input_directory/$sample_name_directory &&
+                     #set collection_identifier =  re.sub('[^\s\w\-]', '_', str($input.input_collection.element_identifier))
+                     #for $f in $input.input_collection:
+                        #set identifier = re.sub('[^\s\w\-\\.]','_',str($f.element_identifier))
+	    		ln -sf '$f' "\$(pwd)"/$input_directory/$sample_name_directory/$identifier &&
+                     #end for
+                #end for
 	    #end if 
 
 	    #if str($tool_cond.tool) == "count"
@@ -60,15 +84,69 @@
             2>&1
 	    #elif str($tool_cond.tool) == "mkgtf"
 	    	cellranger mkgtf $raw_gtf $filtered_gtf $attribute
+	    #elif str($tool_cond.tool) == "multi"
+	    	cp '$multi_config' 'config.txt' &&
+	    	cat tmp >> 'config.txt' &&
+		sed -i "s|input_data_directory|`pwd`/input_data_directory|g" config.txt &&
+                cellranger multi --id=output_directory
+                                 --csv=config.txt
+                                 --localcores=\${GALAXY_SLOTS:-2}
+                                 --localmem=\${GALAXY_MEMORY_GB:-8}
+                                --disable-ui
+                && gunzip -f output_directory/outs/per_sample_outs/output_directory/count/sample_filtered_feature_bc_matrix/matrix.mtx.gz
+                && gunzip -f output_directory/outs/per_sample_outs/output_directory/count/sample_filtered_feature_bc_matrix/features.tsv.gz
+	    	&& gunzip -f output_directory/outs/per_sample_outs/output_directory/count/sample_filtered_feature_bc_matrix/barcodes.tsv.gz
+		&& rm tmp 
+                && 2>&1	   	
 	    #end if
 
     ]]></command>
+    <configfiles>
+        <!-- Header of the cellranger multi config CSV. The multi command copies this file to
+             config.txt and appends one row per input library after the [libraries] header. -->
+        <configfile name="multi_config"><![CDATA[
+#import re
+## Collect the distinct library types and the reference chosen for each kind of library.
+#set $lib_type_multi = list()
+#for $input in $tool_cond.inputs:
+     #set lib_type = $input.type
+     #if str($lib_type) not in $lib_type_multi:
+         ## Use the silent directive so that append() does not emit a stray line into the config.
+         #silent $lib_type_multi.append(str($lib_type))
+     #end if
+     ## NOTE(review): only ref_file (built-in reference) is read here; a reference taken from
+     ## the history (own_ref_file) does not appear to be handled. Confirm.
+     #if str($lib_type) == "Gene Expression":
+         #set selected_gex_ref = $input.reference_source.ref_file.fields.path
+         #set has_bam = $tool_cond.GEX.no_bam
+     #elif str($lib_type) == "VDJ":
+         #set selected_vdj_ref = $input.reference_source.ref_file.fields.path
+     #else:
+         #set selected_gex_ref = $input.reference_source.ref_file.fields.path
+         #set has_bam = $tool_cond.GEX.no_bam
+         #set selected_vdj_ref = $input.reference_source.ref_file.fields.path
+     #end if
+#end for
+
+## Choose the sections from the set of library types seen above, not from the type of
+## whichever library happened to be listed last.
+#if "Gene Expression" in $lib_type_multi and "VDJ" in $lib_type_multi:
+[gene-expression]
+reference, ${selected_gex_ref}
+no-bam,${has_bam}
+[vdj]
+reference, ${selected_vdj_ref}
+#elif "Gene Expression" in $lib_type_multi:
+[gene-expression]
+reference, ${selected_gex_ref}
+no-bam,${has_bam}
+#else:
+[vdj]
+reference, ${selected_vdj_ref}
+#end if
+[libraries]
+fastq_id,fastqs,lanes,feature_types,subsample_rate
+]]></configfile>
+    </configfiles>
     <inputs>
 	    <conditional name="tool_cond">
-		   <param name="tool" type="select" label="Select a CellRanger tool" help="CellRanger tool: count, mkref or mkgtf.">
+		   <param name="tool" type="select" label="Select a CellRanger tool" help="CellRanger tool: count, mkref, mkgtf, multi.">
 			   <option value="count" selected="True">count</option>
 			   <option value="mkref">mkref</option>
 			   <option value="mkgtf">mkgtf</option>
+		           <option value="multi">multi</option>
 		   </param>
 		   <when value="count">
 			 <param name="input_collection" type="data_collection" format="fastq.gz,fastqsanger.gz,fastq" collection_type="list" label="Input Collection" help="A list of paired-end FASTQ files in a collection."/>
@@ -101,6 +179,25 @@
 				  <expand macro="feature_type"/>
 			  </param>
 		   </when>
+		   <when value="multi">
+			 <!-- One repeat entry per library: FASTQ collection, library type, lanes and reference. -->
+			 <repeat name="inputs" title="Input Collections" min="1">
+			      <param name="input_collection" type="data_collection" format="fastq.gz,fastqsanger.gz,fastq" collection_type="list" label="Input Collection"/>
+			      <param name="type" type="select" label="library type" multiple="false" help="Select library type.">
+				   <expand macro="library_type"/>
+			      </param>
+			      <section name="lanes">
+				   <expand macro="number_of_lane"/>
+			      </section>
+			      <expand macro="db_reference"/>
+			 </repeat>
+			 <!-- NOTE(review): of the GEX and VDJ options below, only GEX.no_bam is read by the
+			      multi_config configfile; confirm whether the others should be written to the config. -->
+			 <section name="GEX" title="Gene Expression Options">
+			      <expand macro="gene_expression_options"/>
+			      <expand macro="chemistry"/>
+			 </section>
+			 <section name="VDJ" title="VDJ options">
+			      <expand macro="vdj"/>
+			 </section>
+		   </when>
 	    </conditional>
     </inputs>
     <outputs>
@@ -131,6 +228,38 @@
 	    <data name="tar_ref_output" format="tgz"  label="${tool.name} on ${on_string}: A tarball of the custom reference">
 		    <filter>tool_cond['tool'] == 'mkref'</filter>
 	    </data>
+            <!-- Single-file outputs of the multi tool, picked up from cellranger's output_directory.
+                 The matrix, features and barcodes files are the copies gunzipped by the command. -->
+            <data name="output_summary" format="html" label="Summary from ${tool.name} on ${on_string}" from_work_dir="output_directory/outs/per_sample_outs/output_directory/web_summary.html">
+                <filter>tool_cond['tool'] == 'multi'</filter>
+            </data>
+            <data name="cloupe" format="binary" label="Cloupe file from ${tool.name} on ${on_string}" from_work_dir="output_directory/outs/per_sample_outs/output_directory/count/sample_cloupe.cloupe">
+                <filter>tool_cond['tool'] == 'multi'</filter>
+            </data>
+            <data name="matrix" format="mtx" label="Matrix file from ${tool.name} on ${on_string}" from_work_dir="output_directory/outs/per_sample_outs/output_directory/count/sample_filtered_feature_bc_matrix/matrix.mtx">
+                <filter>tool_cond['tool'] == 'multi'</filter>
+            </data>
+            <data name="feature" format="tabular" label="Feature file from ${tool.name} on ${on_string}" from_work_dir="output_directory/outs/per_sample_outs/output_directory/count/sample_filtered_feature_bc_matrix/features.tsv">
+                <filter>tool_cond['tool'] == 'multi'</filter>
+            </data>
+            <data name="barcode" format="tabular" label="Barcode file from ${tool.name} on ${on_string}" from_work_dir="output_directory/outs/per_sample_outs/output_directory/count/sample_filtered_feature_bc_matrix/barcodes.tsv">
+                <filter>tool_cond['tool'] == 'multi'</filter>
+            </data>
+            <!-- The assembled config CSV handed to cellranger multi. -->
+            <data name="multi_config_out" format="txt" label="${tool.name} on ${on_string}: config" from_work_dir="config.txt">
+                <filter>tool_cond['tool'] == 'multi'</filter>
+            </data>
+	    <!-- h5 and BAM files discovered in the count directory, gathered as a list collection. -->
+	    <!-- NOTE(review): the filter requires GEX.no_bam to be true; confirm that is the intended condition. -->
+	    <collection name="multi_output" type="list" label="${tool.name} on ${on_string}: multi">
+	        <filter>tool_cond['tool'] == 'multi' and tool_cond['GEX']['no_bam']</filter>
+	        <discover_datasets directory="output_directory/outs/per_sample_outs/output_directory/count" pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;h5)" format="h5" visible="false" />
+	        <discover_datasets directory="output_directory/outs/per_sample_outs/output_directory/count" pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;bam)" format="bam" visible="false" />
+	    </collection>
+	    <!-- VDJ result files discovered in the vdj_b directory, gathered as a list collection.
+	         "inputs" is a repeat, i.e. a list of entries, so the filter has to look at the type of
+	         each entry; indexing the list with ['type'] can never evaluate cleanly. -->
+	    <collection name="vdj_output" type="list" label="${tool.name} on ${on_string}: multi vdj output">
+	        <filter>tool_cond['tool'] == 'multi' and any(entry['type'] == "VDJ" for entry in tool_cond['inputs'])</filter>
+	        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;vloupe)" directory="output_directory/outs/per_sample_outs/output_directory/vdj_b" format="binary" visible="false" />
+	        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;tsv)" directory="output_directory/outs/per_sample_outs/output_directory/vdj_b" format="tsv" visible="false" />
+	        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;csv)" directory="output_directory/outs/per_sample_outs/output_directory/vdj_b" format="csv" visible="false" />
+	        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;fastq)" directory="output_directory/outs/per_sample_outs/output_directory/vdj_b" format="fastq" visible="false" />
+	        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;fasta)" directory="output_directory/outs/per_sample_outs/output_directory/vdj_b" format="fasta" visible="false" />
+	        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;bam)" directory="output_directory/outs/per_sample_outs/output_directory/vdj_b" format="bam" visible="false" />
+	    </collection>
     </outputs>
 
     <tests>
@@ -151,6 +280,7 @@ Cell Ranger is a set of analysis pipelines that process Chromium single cell dat
 - count : aligns sequencing reads in FASTQ files to a reference transcriptome
 - mkref : build a custom reference
 - mkgtf : filter GTF files with the feature attributes (i.e gene_biotype:protein_coding)
+- multi : tool for analyzing 3' Cell Multiplexing data
 
 **CellRanger Count**
 
@@ -197,12 +327,28 @@ Cell Ranger is a set of analysis pipelines that process Chromium single cell dat
 
 - A filtered reference genome GTF file
 
+**CellRanger multi**
+
+**Input**
+
+1) A single-cell gene expression dataset in a collection named after the sample. For example, if the gene expression file is sc5p_v2_hs_B_1k_5gex_S1_L001_I1_001.fastq.gz, the collection should be named sc5p_v2_hs_B_1k_5gex (the sample-name prefix of the file name).
+2) A single-cell VDJ dataset in a collection named after the sample. For example, if the VDJ file is sc5p_v2_hs_B_1k_b_S1_L001_I1_001.fastq.gz, the collection should be named sc5p_v2_hs_B_1k_b (the sample-name prefix of the file name).
+3) Both 1) and 2).
+
+**Output**
+
+1) Gene expression - a summary file in HTML format, two h5 files, and Cloupe, barcode, feature and matrix files.
+2) VDJ - Vloupe (Cellranger supported file format), clonotypes, airr_rearrangement, consensus_annotations, filtered_contig_annotations, filtered_contig, concat_ref and consensus.
+3) Outputs 1 and 2 are both generated only when both gene expression and VDJ datasets are provided as input.
+
 .. class:: infomark
 
 **More Information**
 
 - `CellRanger`: https://support.10xgenomics.com/docs/citations
 
+- `Output` : see more https://www.10xgenomics.com/support/software/cell-ranger/latest/analysis/outputs/cr-5p-outputs-overview-vdj
+
 **Citations for 10x Genomics Publications**
 
 ]]></help>

diff --git a/tools/cellranger/macros.xml b/tools/cellranger/macros.xml
@@ -1,6 +1,6 @@
 <macros>
     <token name="@TOOL_VERSION@">7.1.0</token>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <token name="@PROFILE@">22.05</token>
     <token name="@VERSION@">7.1.0</token>
     <xml name="requirements">
@@ -17,6 +17,38 @@
     <xml name="attribute_option" token_value="default">
         <option value="@VALUE@" selected="true">@VALUE@</option>
     </xml>
+    <!-- Reference selector: a built-in index from the cellranger_db data table, or a
+         CellRanger-format tarball taken from the history. -->
+    <xml name="db_reference">
+        <conditional name="reference_source">
+            <param name="reference_source_selector" type="select" label="Will you select a reference genome from your history or use a built-in index?">
+                <option value="cached">Use a built-in genome index</option>
+                <option value="history">Use a genome from history and build index</option>
+            </param>
+            <when value="cached">
+                <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">
+                    <options from_data_table="cellranger_db">
+                        <filter type="sort_by" column="2" />
+                        <validator type="no_options" message="No reference genomes are available" />
+                    </options>
+                    <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file"/>
+                </param>
+            </when>
+            <when value="history">
+                <param name="own_ref_file" type="data" format="tgz" hierarchy="recurse" label="Use the following dataset as the reference sequence" help="You can upload a tarball formatted in CellRanger format as reference" />
+            </when>
+        </conditional>
+    </xml>
+    <!-- Assay chemistry choices; "auto" is the default selection. -->
+    <xml name="chemistry">
+        <param name="chemistry_list" type="select" label="Select chemistry">
+            <option value="auto" selected="true">auto</option>
+            <option value="threeprime">Single Cell 3'</option>
+            <option value="fiveprime">Single Cell 5'</option>
+            <option value="SC3Pv1">Single Cell 3' v1</option>
+            <option value="SC3Pv2">Single Cell 3' v2</option>
+            <option value="SC5P-PE">SC5P-PE</option>
+            <option value="SC5P-R2">SC5P-R2 for R2-only</option>
+            <option value="SC-FB">SC-FB for Single Cell AntiBody-only</option>
+        </param>
+    </xml>
     <xml name="feature_type">
         <expand macro="attribute_option" value="protein_coding"/>
         <expand macro="attribute_option" value="lncRNA"/>
@@ -35,4 +67,50 @@
         <expand macro="attribute_option" value="TR_V_pseudogene"/>
         <expand macro="attribute_option" value="TR_J_pseudogene"/>
     </xml>
+    <!-- Library types offered for each multi input. The option value is written verbatim into
+         the feature_types column of the [libraries] rows, so it must match the name cellranger
+         expects ("CRISPR Guide Capture", not "CRISP Guide Capture"). Plain options are used
+         instead of the attribute_option macro, which marks every option selected="true" and
+         therefore left this single-choice select without a well-defined default. -->
+    <!-- NOTE(review): confirm that "Antigen Capture (BEAM)" is accepted verbatim as a feature type. -->
+    <xml name="library_type">
+        <option value="Gene Expression" selected="true">Gene Expression</option>
+        <option value="VDJ">VDJ</option>
+        <option value="VDJ-T">VDJ-T</option>
+        <option value="VDJ-T-GD">VDJ-T-GD</option>
+        <option value="VDJ-B">VDJ-B</option>
+        <option value="Antibody Capture">Antibody Capture</option>
+        <option value="Antigen Capture (BEAM)">Antigen Capture (BEAM)</option>
+        <option value="CRISPR Guide Capture">CRISPR Guide Capture</option>
+    </xml>
+    <!-- Gene expression options for cellranger multi. no_bam is written to the config as the
+         no-bam setting, so ticking it turns BAM generation off; the label says so explicitly
+         and the default is spelled as a valid boolean. -->
+    <xml name="gene_expression_options">
+        <param name="no_target_umi_filter" type="boolean" truevalue="true" falsevalue="false" checked="False" label="No target umi filter"/>
+        <param name="r1_length" type="text" optional="True" label="R1 length" help=""/>
+        <param name="r2_length" type="text" optional="True" label="R2 length" help=""/>
+        <param name="expect_cells" type="text" optional="True" label="Expect cells" help=""/>
+        <param name="force_cells" type="text" optional="True" label="Force cells" help=""/>
+        <param name="include_introns" type="boolean" truevalue="true" falsevalue="false" checked="True" label="Include introns"/>
+        <param name="no_secondary" type="boolean" truevalue="true" falsevalue="false" checked="True" label="No Secondary"/>
+        <param name="no_bam" type="boolean" truevalue="true" falsevalue="false" checked="False" label="Do not create BAM file (no-bam)"/>
+        <param name="check_library_compatibility" type="boolean" truevalue="true" falsevalue="false" checked="True" label="Check library compatibility"/>
+    </xml>
+    <!-- Optional VDJ settings: inner enrichment primers and read-length limits. -->
+    <xml name="vdj">
+        <param name="inner_enrichment_primers" type="text" optional="True" label="Inner enrichment primers"/>
+        <param name="r1_length" type="text" optional="True" label="R1 length"/>
+        <param name="r2_length" type="text" optional="True" label="R2 length"/>
+    </xml>
+    <!-- Lane selection: all lanes by default, or an explicit multi-select of lanes 1 to 6. -->
+    <xml name="number_of_lane">
+        <conditional name="lane_source">
+            <param name="lane_source_selector" type="select" label="Select number of lanes for your dataset?">
+                <option value="default">Default: all lanes</option>
+                <option value="user_define">User define</option>
+            </param>
+            <when value="default"/>
+            <when value="user_define">
+                <param name="lane" type="select" multiple="true" label="Select a list of lanes">
+                    <option value="1">Lane 1</option>
+                    <option value="2">Lane 2</option>
+                    <option value="3">Lane 3</option>
+                    <option value="4">Lane 4</option>
+                    <option value="5">Lane 5</option>
+                    <option value="6">Lane 6</option>
+                </param>
+            </when>
+        </conditional>
+    </xml>
 </macros>
diff --git a/tools/dorado/.shed.yml b/tools/dorado/.shed.yml
@@ -0,0 +1,23 @@
+---
+# Tool Shed metadata for the dorado wrappers: one repository per tool plus a suite.
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "{{ tool_name }} from the dorado suite"
+categories:
+  - Sequence Analysis
+description: Dorado is a high-performance, easy-to-use, open source basecaller for Oxford Nanopore reads.
+# Test reports are excluded from the uploaded repository.
+exclude:
+  - tool_test_output.html
+  - tool_test_output.json
+homepage_url: https://github.com/nanoporetech/dorado
+long_description: >
+  Dorado is a high-performance, easy-to-use, open source basecaller for Oxford Nanopore reads.
+name: dorado
+owner: galaxy-australia
+# NOTE(review): this URL points at a `main` branch; confirm the repository's default branch name.
+remote_repository_url: https://github.com/usegalaxy-au/tools-au/tree/main/tools/dorado
+suite:
+  name: suite_dorado
+  description: >
+    Dorado is a high-performance, easy-to-use, open source basecaller for Oxford Nanopore reads.
+  long_description: >
+    Dorado is a high-performance, easy-to-use, open source basecaller for Oxford Nanopore reads.
+type: unrestricted
diff --git a/tools/dorado/README.md b/tools/dorado/README.md
@@ -0,0 +1,48 @@
+
+## Tool versions
+
+Dorado is distributed on
+[DockerHub](https://hub.docker.com/r/nanoporetech/dorado/tags) by nanoporetech.
+The containers are identified by a `sha<hash>` tag, but not tagged with a version.
+
+We can still use the containers and display the dorado version by hard-coding
+both dorado version and container hash into the wrapper (see `macros.xml`).
+Unfortunately you have to pull a >6 GB container and run `dorado --version` just
+to check the tool version. This also prevents auto-updates of this wrapper.
+
+You can update the list of models at the same time (see
+below). **You must do this when you update the wrapper**.
+
+## Basecalling models
+
+The models are bundled in the container at `/models` and made available by the
+`dorado_models.loc` file. 
+
+The columns are `value`, `container_hash`, `name` and `path`.
+
+To update the list, modify `tool-data/dorado_models.loc.sample`.
+
+Because models can be added and removed, models are listed **per container** in
+the loc file.
+
+Here's some code to **append** the models from the container with hash
+`1c65eb070a9fc1d88710c4dc09b06541f96fdd28`  to the loc file.
+
+```bash
+export DORADO_HASH="1c65eb070a9fc1d88710c4dc09b06541f96fdd28"
+
+apptainer exec "docker://nanoporetech/dorado:sha${DORADO_HASH}" \
+    ls /models | \
+    awk -v hash="${DORADO_HASH}" '{print hash "_" $0 "\t" hash "\t" $0 "\t/models/" $0}' \
+    >> tool-data/dorado_models.loc.sample
+```
+
+The loc file doesn't have a header, so you can keep it sorted.
+
+```bash
+cp tool-data/dorado_models.loc.sample \
+    tool-data/dorado_models.loc.sample.old &&
+sort -t$'\t' -k1,1V tool-data/dorado_models.loc.sample.old \
+    > tool-data/dorado_models.loc
+```
+