Merge pull request #148 from TomHarrop/dorado_basecall

Adding a wrapper for trimming already-basecalled reads
usegalaxy-au · Nov 4, 2024 · 8f48f7e · 8f48f7e
2 parents d626bb2 + bf243d8
commit 8f48f7e
Show file tree

Hide file tree

Showing 10 changed files with 126 additions and 0 deletions.
diff --git a/tools/dorado/dorado_trimming.xml b/tools/dorado/dorado_trimming.xml
@@ -0,0 +1,102 @@
+<tool id="dorado_trimming" name="Dorado adapter and primer trimming" version="@VERSION@+galaxy0" python_template_version="3.5" profile="24.1">
+    <description>for Oxford Nanopore (ONT) DNA reads</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="xrefs"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+## Expose the input under a fixed name inside the job working directory.
+ln -s '$reads' ./reads
+
+&&
+
+## Trim adapters (and primers, unless disabled). GALAXY_SLOTS falls back to 1
+## so that --threads never receives an empty value when the job runner does
+## not export the variable.
+dorado trim
+--verbose
+--threads "\${GALAXY_SLOTS:-1}"
+#if $no_trim_primers
+    --no-trim-primers
+#end if
+#if $primer_sequences
+    --primer-sequences '$primer_sequences'
+#end if
+reads
+> trimmed.bam
+
+&&
+
+## Write a sequencing summary table for the trimmed reads.
+dorado summary
+trimmed.bam
+> summary.tsv
+
+        ]]></command>
+    <inputs>
+        <!-- Basecalled reads to trim; linked into the working directory as ./reads by the command. -->
+        <param name="reads"
+               type="data"
+               format="bam,fastqsanger,unsorted.bam"
+               label="Existing, basecalled DNA dataset"
+               help="Note: this tool does not support trimming adaptors from RNA reads. These need to be removed during basecalling."/>
+        <!-- When set, the command adds the no-trim-primers flag to dorado trim. -->
+        <param argument="--no-trim-primers"
+               type="boolean"
+               label="Don't trim primers"
+               help="This option can be used to prevent the trimming of primer sequences. In this case only adapter sequences will be trimmed."/>
+        <!-- Optional FASTA dataset; when supplied, the command passes it as the primer-sequences option. -->
+        <param argument="--primer-sequences"
+               type="data"
+               format="fasta"
+               optional="true"
+               label="Custom primer sequences"
+               help="You can specify an alternative set of primer sequences to search for when trimming by adding a FASTA file containing the primer sequences you want to search for. The record names of the sequences do not matter. Note that if you use this option the normal primer sequences built-in to the dorado software will not be searched for."/>
+    </inputs>
+    <outputs>
+        <!-- Trimmed reads, collected from trimmed.bam in the job working directory. -->
+        <data name="out_bam"
+              format="unsorted.bam"
+              from_work_dir="trimmed.bam"
+              label="Reads from ${on_string} trimmed by the ${tool.name} tool"/>
+        <!-- Summary table, collected from summary.tsv in the job working directory. -->
+        <data name="out_tsv"
+              format="tsv"
+              from_work_dir="summary.tsv"
+              label="${tool.name} sequencing summary for ${on_string}"/>
+    </outputs>
+    <tests>
+        <!--
+            Outputs are compared with sort="true": the expected summaries for
+            tests 1 and 2 list the same reads in a different order, so record
+            order is not treated as significant.
+        -->
+        <!-- Test 1: BAM input with default options. -->
+        <test expect_num_outputs="2">
+            <param name="reads" value="FAL00375_473bf0ed_0.ten_reads.bam"/>
+            <output name="out_bam" ftype="unsorted.bam" file="dorado_trimming_test1.bam" sort="true"/>
+            <output name="out_tsv" ftype="tsv" file="dorado_trimming_test1.tsv" sort="true"/>
+        </test>
+        <!-- Test 2: BAM input with primer trimming disabled. -->
+        <test expect_num_outputs="2">
+            <param name="reads" value="FAL00375_473bf0ed_0.ten_reads.bam"/>
+            <param name="no_trim_primers" value="True"/>
+            <output name="out_bam" ftype="unsorted.bam" file="dorado_trimming_test2.bam" sort="true"/>
+            <output name="out_tsv" ftype="tsv" file="dorado_trimming_test2.tsv" sort="true"/>
+        </test>
+        <!-- Test 3: gzipped FASTQ input with a custom primer FASTA. -->
+        <test expect_num_outputs="2">
+            <param name="reads" value="lsk109_single_read.fastqsanger.gz" ftype="fastqsanger.gz"/>
+            <param name="primer_sequences" value="custom_primers.fasta.gz" ftype="fasta.gz"/>
+            <output name="out_bam" ftype="unsorted.bam" file="dorado_trimming_test3.bam" sort="true"/>
+            <output name="out_tsv" ftype="tsv" file="dorado_trimming_test3.tsv" sort="true"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+Detect and remove any adapter and/or primer sequences from the beginning
+and end of DNA reads using Oxford Nanopore’s open source
+`Dorado <https://github.com/nanoporetech/dorado/>`__ basecaller.
+
+This tool scans existing, basecalled datasets for adapter and/or primer
+sequences at either end, and trims any such found sequences.
+
+**If you have raw (un-basecalled) data, you can trim them during
+basecalling with the Dorado tool on Galaxy**.
+
+Note that if you intend to demultiplex the reads later, trimming
+adapters and primers may result in some portions of the flanking regions
+of the barcodes being removed, which could interfere with correct
+demultiplexing.
+
+The **Don't trim primers** option can be used to prevent the trimming of
+primer sequences. In this case only adapter sequences will be trimmed.
+
+The output will always be unaligned records, regardless of whether
+the input is aligned/sorted or not.
+
+Custom primer trimming
+----------------------
+
+The software automatically searches for primer sequences used in Oxford
+Nanopore kits. However, you can specify an alternative set of primer
+sequences to search by adding a FASTA file of primer sequences in the
+**Custom primer sequences** option. The record names of the sequences do
+not matter. Note that if you use this option the normal primer sequences
+built-in to the dorado software will not be searched for.
+
+RNA adapter trimming
+--------------------
+
+Adapters for RNA002 and RNA004 kits are automatically trimmed during
+basecalling. However, unlike in DNA, the RNA adapter cannot be trimmed
+post-basecalling.
+            ]]></help>
+    <expand macro="citation"/>
+</tool>
diff --git a/tools/dorado/test-data/FAL00375_473bf0ed_0.ten_reads.bam b/tools/dorado/test-data/FAL00375_473bf0ed_0.ten_reads.bam
diff --git a/tools/dorado/test-data/custom_primers.fasta.gz b/tools/dorado/test-data/custom_primers.fasta.gz
diff --git a/tools/dorado/test-data/dorado_trimming_test1.bam b/tools/dorado/test-data/dorado_trimming_test1.bam
diff --git a/tools/dorado/test-data/dorado_trimming_test1.tsv b/tools/dorado/test-data/dorado_trimming_test1.tsv
@@ -0,0 +1,11 @@
+filename	read_id	run_id	channel	mux	start_time	duration	template_start	template_duration	sequence_length_template	mean_qscore_template	barcode
+reads.pod5	005cf7ae-4d74-42dd-ab96-9befed842822	473bf0edfc2f8f756173de35db5da9b6f6db4959	473	2	258.872	29.436	259.165	29.1435	14587	5.17895	unclassified
+reads.pod5	0066800d-d191-4833-a495-cfe8b925aca0	473bf0edfc2f8f756173de35db5da9b6f6db4959	56	4	359.11	14.0378	359.11	14.0378	5111	9.1072	unclassified
+reads.pod5	00777c4b-cbd6-4a79-8647-bbe5f5f3f3bf	473bf0edfc2f8f756173de35db5da9b6f6db4959	300	3	329.733	2.5165	329.766	2.484	1055	12.9149	unclassified
+reads.pod5	002f231b-5d37-437f-a027-a2e8b872e73b	473bf0edfc2f8f756173de35db5da9b6f6db4959	118	3	534.745	19.5847	534.745	19.5847	8387	9.88254	unclassified
+reads.pod5	000a9728-0a7c-4b64-9791-76bb30b63796	473bf0edfc2f8f756173de35db5da9b6f6db4959	105	4	331.319	3.27625	331.319	3.27625	1175	12.0328	unclassified
+reads.pod5	0067486b-9f92-4849-8456-671463e64412	473bf0edfc2f8f756173de35db5da9b6f6db4959	84	4	164.018	2.453	164.018	2.453	885	12.399	unclassified
+reads.pod5	009f5efd-de5d-4a7e-9d17-969c3996cbc8	473bf0edfc2f8f756173de35db5da9b6f6db4959	230	2	171.634	3.27475	171.634	3.27475	1232	13.6258	unclassified
+reads.pod5	00aeb4ba-e404-49d2-97c5-8fcf22547f81	473bf0edfc2f8f756173de35db5da9b6f6db4959	51	4	297.891	50.1888	297.984	50.0962	19142	8.45178	unclassified
+reads.pod5	0072b26f-f37c-4517-afa7-621543ac2187	473bf0edfc2f8f756173de35db5da9b6f6db4959	317	3	111.275	2.31875	111.275	2.31875	929	9.69614	unclassified
+reads.pod5	00ad521b-b916-404f-a31d-a657f9aa7756	473bf0edfc2f8f756173de35db5da9b6f6db4959	365	2	566.263	23.6745	566.263	23.6745	10934	12.074	unclassified
diff --git a/tools/dorado/test-data/dorado_trimming_test2.bam b/tools/dorado/test-data/dorado_trimming_test2.bam
diff --git a/tools/dorado/test-data/dorado_trimming_test2.tsv b/tools/dorado/test-data/dorado_trimming_test2.tsv
@@ -0,0 +1,11 @@
+filename	read_id	run_id	channel	mux	start_time	duration	template_start	template_duration	sequence_length_template	mean_qscore_template	barcode
+reads.pod5	00777c4b-cbd6-4a79-8647-bbe5f5f3f3bf	473bf0edfc2f8f756173de35db5da9b6f6db4959	300	3	329.733	2.5165	329.766	2.484	1055	12.9149	unclassified
+reads.pod5	0072b26f-f37c-4517-afa7-621543ac2187	473bf0edfc2f8f756173de35db5da9b6f6db4959	317	3	111.275	2.31875	111.275	2.31875	929	9.69614	unclassified
+reads.pod5	0067486b-9f92-4849-8456-671463e64412	473bf0edfc2f8f756173de35db5da9b6f6db4959	84	4	164.018	2.453	164.018	2.453	885	12.399	unclassified
+reads.pod5	002f231b-5d37-437f-a027-a2e8b872e73b	473bf0edfc2f8f756173de35db5da9b6f6db4959	118	3	534.745	19.5847	534.745	19.5847	8387	9.88254	unclassified
+reads.pod5	0066800d-d191-4833-a495-cfe8b925aca0	473bf0edfc2f8f756173de35db5da9b6f6db4959	56	4	359.11	14.0378	359.11	14.0378	5111	9.1072	unclassified
+reads.pod5	009f5efd-de5d-4a7e-9d17-969c3996cbc8	473bf0edfc2f8f756173de35db5da9b6f6db4959	230	2	171.634	3.27475	171.634	3.27475	1232	13.6258	unclassified
+reads.pod5	005cf7ae-4d74-42dd-ab96-9befed842822	473bf0edfc2f8f756173de35db5da9b6f6db4959	473	2	258.872	29.436	259.165	29.1435	14587	5.17895	unclassified
+reads.pod5	000a9728-0a7c-4b64-9791-76bb30b63796	473bf0edfc2f8f756173de35db5da9b6f6db4959	105	4	331.319	3.27625	331.319	3.27625	1175	12.0328	unclassified
+reads.pod5	00ad521b-b916-404f-a31d-a657f9aa7756	473bf0edfc2f8f756173de35db5da9b6f6db4959	365	2	566.263	23.6745	566.263	23.6745	10934	12.074	unclassified
+reads.pod5	00aeb4ba-e404-49d2-97c5-8fcf22547f81	473bf0edfc2f8f756173de35db5da9b6f6db4959	51	4	297.891	50.1888	297.984	50.0962	19142	8.45178	unclassified
diff --git a/tools/dorado/test-data/dorado_trimming_test3.bam b/tools/dorado/test-data/dorado_trimming_test3.bam
diff --git a/tools/dorado/test-data/dorado_trimming_test3.tsv b/tools/dorado/test-data/dorado_trimming_test3.tsv
@@ -0,0 +1,2 @@
+filename	read_id	run_id	channel	mux	start_time	duration	template_start	template_duration	sequence_length_template	mean_qscore_template	barcode
+	2f707b6e-0060-4f33-9c92-a1230d26cb21	unknown	0	0	0	0	0	0	421	0	unclassified
diff --git a/tools/dorado/test-data/lsk109_single_read.fastqsanger.gz b/tools/dorado/test-data/lsk109_single_read.fastqsanger.gz