Skip to content

Commit

Permalink
Merge pull request #148 from TomHarrop/dorado_basecall
Browse files Browse the repository at this point in the history
Adding a wrapper for trimming already-basecalled reads
  • Loading branch information
TomHarrop authored Nov 4, 2024
2 parents d626bb2 + bf243d8 commit 8f48f7e
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 0 deletions.
102 changes: 102 additions & 0 deletions tools/dorado/dorado_trimming.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
<tool id="dorado_trimming" name="Dorado adapter and primer trimming" version="@VERSION@+galaxy0" python_template_version="3.5" profile="24.1">
<description>for Oxford Nanopore (ONT) DNA reads</description>
<macros>
<import>macros.xml</import>
</macros>
<expand macro="xrefs"/>
<expand macro="requirements"/>
<!--
Command template. The input dataset is symlinked to a fixed name, passed
through `dorado trim`, and the trimmed BAM is then summarised per read with
`dorado summary`. Any non-zero exit code marks the job as failed.
-->
<command detect_errors="exit_code"><![CDATA[
ln -s '$reads' ./reads
&&
dorado trim
--verbose
## Fall back to a single thread if GALAXY_SLOTS is not set in the job environment
--threads "\${GALAXY_SLOTS:-1}"
#if $no_trim_primers
--no-trim-primers
#end if
## primer_sequences is optional: only pass the flag when a dataset was selected
#if $primer_sequences
--primer-sequences '$primer_sequences'
#end if
reads
> trimmed.bam
&&
dorado summary
trimmed.bam
> summary.tsv
]]></command>
<inputs>
    <!--
    Basecalled reads to trim. fastqsanger.gz is listed explicitly so that
    gzip-compressed FASTQ datasets (as used by the third test) are accepted
    as-is. NOTE(review): assumes dorado reads gzipped FASTQ directly - confirm.
    -->
    <param name="reads" type="data" format="bam,unsorted.bam,fastqsanger,fastqsanger.gz" label="Existing, basecalled DNA dataset" help="Note: this tool does not support trimming adaptors from RNA reads. These need to be removed during basecalling."/>
    <!-- When checked, only adapters are trimmed and primers are left in place -->
    <param argument="--no-trim-primers" type="boolean" label="Don't trim primers" help="This option can be used to prevent the trimming of primer sequences. In this case only adapter sequences will be trimmed."/>
    <!-- Optional FASTA of primers that replaces dorado's built-in primer set -->
    <param argument="--primer-sequences" type="data" format="fasta" optional="true" label="Custom primer sequences" help="You can specify an alternative set of primer sequences to search for when trimming by adding a FASTA file containing the primer sequences you want to search for. The record names of the sequences do not matter. Note that if you use this option the normal primer sequences built-in to the dorado software will not be searched for."/>
</inputs>
<outputs>
    <!-- Trimmed reads, collected from trimmed.bam in the job working directory -->
    <data name="out_bam"
          format="unsorted.bam"
          from_work_dir="trimmed.bam"
          label="Reads from ${on_string} trimmed by the ${tool.name} tool"/>
    <!-- Summary table, collected from summary.tsv in the job working directory -->
    <data name="out_tsv"
          format="tsv"
          from_work_dir="summary.tsv"
          label="${tool.name} sequencing summary for ${on_string}"/>
</outputs>
<tests>
    <!--
    The summary TSVs are compared with sort="true" so the comparison does not
    depend on the order in which reads are written: the expected summaries for
    tests 1 and 2 list the same reads in different orders.
    -->
    <!-- Test 1: BAM input with default settings -->
    <test expect_num_outputs="2">
        <param name="reads" value="FAL00375_473bf0ed_0.ten_reads.bam"/>
        <output name="out_bam" ftype="unsorted.bam" file="dorado_trimming_test1.bam"/>
        <output name="out_tsv" ftype="tsv" file="dorado_trimming_test1.tsv" sort="true"/>
    </test>
    <!-- Test 2: BAM input with primer trimming disabled -->
    <test expect_num_outputs="2">
        <param name="reads" value="FAL00375_473bf0ed_0.ten_reads.bam"/>
        <param name="no_trim_primers" value="True"/>
        <output name="out_bam" ftype="unsorted.bam" file="dorado_trimming_test2.bam"/>
        <output name="out_tsv" ftype="tsv" file="dorado_trimming_test2.tsv" sort="true"/>
    </test>
    <!-- Test 3: gzipped FASTQ input with a custom primer FASTA -->
    <test expect_num_outputs="2">
        <param name="reads" value="lsk109_single_read.fastqsanger.gz" ftype="fastqsanger.gz"/>
        <param name="primer_sequences" value="custom_primers.fasta.gz" ftype="fasta.gz"/>
        <output name="out_bam" ftype="unsorted.bam" file="dorado_trimming_test3.bam"/>
        <output name="out_tsv" ftype="tsv" file="dorado_trimming_test3.tsv" sort="true"/>
    </test>
</tests>
<help><![CDATA[
Detect and remove any adapter and/or primer sequences from the beginning
and end of DNA reads using Oxford Nanopore’s open source
`Dorado <https://github.com/nanoporetech/dorado/>`__ basecaller.
This tool scans existing, basecalled datasets for adapter and/or primer
sequences at either end, and trims any such found sequences.
**If you have raw (un-basecalled) data, you can trim them during
basecalling with the Dorado tool on Galaxy**.
Note that if you intend to demultiplex the reads later, trimming
adapters and primers may result in some portions of the flanking regions
of the barcodes being removed, which could interfere with correct
demultiplexing.
The **Don't trim primers** option can be used to prevent the trimming of
primer sequences. In this case only adapter sequences will be trimmed.
The output will always be unaligned records, regardless of whether
the input is aligned/sorted or not.
Custom primer trimming
----------------------
The software automatically searches for primer sequences used in Oxford
Nanopore kits. However, you can specify an alternative set of primer
sequences to search by adding a FASTA file of primer sequences in the
**Custom primer sequences** option. The record names of the sequences do
not matter. Note that if you use this option the normal primer sequences
built-in to the dorado software will not be searched for.
RNA adapter trimming
--------------------
Adapters for RNA002 and RNA004 kits are automatically trimmed during
basecalling. However, unlike in DNA, the RNA adapter cannot be trimmed
post-basecalling.
]]></help>
<expand macro="citation"/>
</tool>
Binary file not shown.
Binary file added tools/dorado/test-data/custom_primers.fasta.gz
Binary file not shown.
Binary file added tools/dorado/test-data/dorado_trimming_test1.bam
Binary file not shown.
11 changes: 11 additions & 0 deletions tools/dorado/test-data/dorado_trimming_test1.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
filename read_id run_id channel mux start_time duration template_start template_duration sequence_length_template mean_qscore_template barcode
reads.pod5 005cf7ae-4d74-42dd-ab96-9befed842822 473bf0edfc2f8f756173de35db5da9b6f6db4959 473 2 258.872 29.436 259.165 29.1435 14587 5.17895 unclassified
reads.pod5 0066800d-d191-4833-a495-cfe8b925aca0 473bf0edfc2f8f756173de35db5da9b6f6db4959 56 4 359.11 14.0378 359.11 14.0378 5111 9.1072 unclassified
reads.pod5 00777c4b-cbd6-4a79-8647-bbe5f5f3f3bf 473bf0edfc2f8f756173de35db5da9b6f6db4959 300 3 329.733 2.5165 329.766 2.484 1055 12.9149 unclassified
reads.pod5 002f231b-5d37-437f-a027-a2e8b872e73b 473bf0edfc2f8f756173de35db5da9b6f6db4959 118 3 534.745 19.5847 534.745 19.5847 8387 9.88254 unclassified
reads.pod5 000a9728-0a7c-4b64-9791-76bb30b63796 473bf0edfc2f8f756173de35db5da9b6f6db4959 105 4 331.319 3.27625 331.319 3.27625 1175 12.0328 unclassified
reads.pod5 0067486b-9f92-4849-8456-671463e64412 473bf0edfc2f8f756173de35db5da9b6f6db4959 84 4 164.018 2.453 164.018 2.453 885 12.399 unclassified
reads.pod5 009f5efd-de5d-4a7e-9d17-969c3996cbc8 473bf0edfc2f8f756173de35db5da9b6f6db4959 230 2 171.634 3.27475 171.634 3.27475 1232 13.6258 unclassified
reads.pod5 00aeb4ba-e404-49d2-97c5-8fcf22547f81 473bf0edfc2f8f756173de35db5da9b6f6db4959 51 4 297.891 50.1888 297.984 50.0962 19142 8.45178 unclassified
reads.pod5 0072b26f-f37c-4517-afa7-621543ac2187 473bf0edfc2f8f756173de35db5da9b6f6db4959 317 3 111.275 2.31875 111.275 2.31875 929 9.69614 unclassified
reads.pod5 00ad521b-b916-404f-a31d-a657f9aa7756 473bf0edfc2f8f756173de35db5da9b6f6db4959 365 2 566.263 23.6745 566.263 23.6745 10934 12.074 unclassified
Binary file added tools/dorado/test-data/dorado_trimming_test2.bam
Binary file not shown.
11 changes: 11 additions & 0 deletions tools/dorado/test-data/dorado_trimming_test2.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
filename read_id run_id channel mux start_time duration template_start template_duration sequence_length_template mean_qscore_template barcode
reads.pod5 00777c4b-cbd6-4a79-8647-bbe5f5f3f3bf 473bf0edfc2f8f756173de35db5da9b6f6db4959 300 3 329.733 2.5165 329.766 2.484 1055 12.9149 unclassified
reads.pod5 0072b26f-f37c-4517-afa7-621543ac2187 473bf0edfc2f8f756173de35db5da9b6f6db4959 317 3 111.275 2.31875 111.275 2.31875 929 9.69614 unclassified
reads.pod5 0067486b-9f92-4849-8456-671463e64412 473bf0edfc2f8f756173de35db5da9b6f6db4959 84 4 164.018 2.453 164.018 2.453 885 12.399 unclassified
reads.pod5 002f231b-5d37-437f-a027-a2e8b872e73b 473bf0edfc2f8f756173de35db5da9b6f6db4959 118 3 534.745 19.5847 534.745 19.5847 8387 9.88254 unclassified
reads.pod5 0066800d-d191-4833-a495-cfe8b925aca0 473bf0edfc2f8f756173de35db5da9b6f6db4959 56 4 359.11 14.0378 359.11 14.0378 5111 9.1072 unclassified
reads.pod5 009f5efd-de5d-4a7e-9d17-969c3996cbc8 473bf0edfc2f8f756173de35db5da9b6f6db4959 230 2 171.634 3.27475 171.634 3.27475 1232 13.6258 unclassified
reads.pod5 005cf7ae-4d74-42dd-ab96-9befed842822 473bf0edfc2f8f756173de35db5da9b6f6db4959 473 2 258.872 29.436 259.165 29.1435 14587 5.17895 unclassified
reads.pod5 000a9728-0a7c-4b64-9791-76bb30b63796 473bf0edfc2f8f756173de35db5da9b6f6db4959 105 4 331.319 3.27625 331.319 3.27625 1175 12.0328 unclassified
reads.pod5 00ad521b-b916-404f-a31d-a657f9aa7756 473bf0edfc2f8f756173de35db5da9b6f6db4959 365 2 566.263 23.6745 566.263 23.6745 10934 12.074 unclassified
reads.pod5 00aeb4ba-e404-49d2-97c5-8fcf22547f81 473bf0edfc2f8f756173de35db5da9b6f6db4959 51 4 297.891 50.1888 297.984 50.0962 19142 8.45178 unclassified
Binary file added tools/dorado/test-data/dorado_trimming_test3.bam
Binary file not shown.
2 changes: 2 additions & 0 deletions tools/dorado/test-data/dorado_trimming_test3.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
filename read_id run_id channel mux start_time duration template_start template_duration sequence_length_template mean_qscore_template barcode
2f707b6e-0060-4f33-9c92-a1230d26cb21 unknown 0 0 0 0 0 0 421 0 unclassified
Binary file not shown.

0 comments on commit 8f48f7e

Please sign in to comment.