diff --git a/.github/workflows/test_pr_lreads_docker_ont_hifi.bkp b/.github/workflows/test_pr_lreads_docker_ont_hifi.bkp new file mode 100644 index 00000000..42882ea7 --- /dev/null +++ b/.github/workflows/test_pr_lreads_docker_ont_hifi.bkp @@ -0,0 +1,37 @@ +name: Testing long-reads HIFI / docker (ONT) from PR +on: + pull_request: + branches: [ master, dev ] + types: [ opened, synchronize, reopened ] + +jobs: + run_nextflow: + name: Run pipeline for the upcoming PR + runs-on: ubuntu-latest + + steps: + + - name: Check out pipeline code + uses: actions/checkout@v2 + + - name: Install Nextflow + env: + CAPSULE_LOG: none + run: | + wget -qO- get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + + - name: Clean environment + run: | + sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android + sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET + + - name: Run tests for long-reads (ont) + run: | + nextflow run main.nf -profile docker,test,lreads,ont_hifi --max_memory '6.GB' --max_cpus 4 + rm -r work .nextflow* + + - name: View results + run: | + sudo apt-get install -y tree + tree lreads_test_ont_hifi diff --git a/Dockerfile b/Dockerfile index 0e015d74..2b2b5fd0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,8 +17,8 @@ RUN medaka --help # fix permissions USER root -RUN mkdir -p $CONDA_PREFIX/envs/mpgap-3.2/lib/python3.8/site-packages/medaka && \ - chmod -R 777 $CONDA_PREFIX/envs/mpgap-3.2/lib/python3.8/site-packages/medaka +RUN mkdir -p $CONDA_PREFIX/envs/mpgap-3.2/lib/python3.9/site-packages/medaka && \ + chmod -R 777 $CONDA_PREFIX/envs/mpgap-3.2/lib/python3.9/site-packages/medaka # pre-download BUSCO bacteria database RUN mkdir -p /opt/busco_db/ && \ diff --git a/README.md b/README.md index 5fee9549..431ec09e 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ This pipeline wraps up the following software: || **Source** | |:- | :- | -| **Assemblers** | [Canu](https://github.com/marbl/canu), 
[Flye](https://github.com/fenderglass/Flye), [Raven](https://github.com/lbcb-sci/raven), [Shasta](https://github.com/chanzuckerberg/shasta), [wtdbg2](https://github.com/ruanjue/wtdbg2), [Haslr](https://github.com/vpc-ccg/haslr), [Unicycler](https://github.com/rrwick/Unicycler), [Spades](https://github.com/ablab/spades), [Shovill](https://github.com/tseemann/shovill), [Megahit](https://github.com/voutcn/megahit) | +| **Assemblers** | [Hifiasm](https://github.com/chhylp123/hifiasm), [Canu](https://github.com/marbl/canu), [Flye](https://github.com/fenderglass/Flye), [Raven](https://github.com/lbcb-sci/raven), [Shasta](https://github.com/chanzuckerberg/shasta), [wtdbg2](https://github.com/ruanjue/wtdbg2), [Haslr](https://github.com/vpc-ccg/haslr), [Unicycler](https://github.com/rrwick/Unicycler), [Spades](https://github.com/ablab/spades), [Shovill](https://github.com/tseemann/shovill), [Megahit](https://github.com/voutcn/megahit) | | **Polishers** | [Nanopolish](https://github.com/jts/nanopolish), [Medaka](https://github.com/nanoporetech/medaka), [gcpp](https://github.com/PacificBiosciences/gcpp), [Polypolish](https://github.com/rrwick/Polypolish) and [Pilon](https://github.com/broadinstitute/pilon) | | **Quality check** | [Quast](https://github.com/ablab/quast), [BUSCO](https://busco.ezlab.org/busco_userguide.html) and [MultiQC](https://multiqc.info/) | diff --git a/assets/lreads_test_ont_hifi.yml b/assets/lreads_test_ont_hifi.yml new file mode 100644 index 00000000..7bec2281 --- /dev/null +++ b/assets/lreads_test_ont_hifi.yml @@ -0,0 +1,4 @@ +samplesheet: + - id: ont_only_hifi + nanopore: https://github.com/fmalmeida/test_datasets/raw/main/SRR27467590.fq.gz + genome_size: 4m \ No newline at end of file diff --git a/conf/defaults.config b/conf/defaults.config index c56c341c..c17144ca 100644 --- a/conf/defaults.config +++ b/conf/defaults.config @@ -143,10 +143,12 @@ params { skip_shasta = false // Nanopore longreads only assemblies shasta_additional_parameters = null 
// Must be given as shown in shasta manual. E.g. " --Reads.minReadLength 5000 " + skip_hifiasm = false // Longreads only assemblies + hifiasm_additional_parameters = null // Must be given as shown in Hifiasm manual. E.g. " --ul ul.fq.gz " + skip_pilon = false // Skip pilon polisher when performing hybrid assembly strategy 2 skip_polypolish = false // Skip polypolisher polisher when performing hybrid assembly strategy 2 - /* * Resources controlling parameters * @@ -174,4 +176,4 @@ params { max_cpus = 10 max_time = '40.h' -} \ No newline at end of file +} diff --git a/conf/docker.config b/conf/docker.config index 84a07834..a6f18a43 100644 --- a/conf/docker.config +++ b/conf/docker.config @@ -6,6 +6,6 @@ docker.runOptions = '-u \$(id -u):\$(id -g)' fixOwnership = true process { withName: '.*' { - container = "fmalmeida/mpgap@sha256:f640835dad87d98ded0582271aafaebe609f9196618f52a46ac10d991a0fce27" + container = "fmalmeida/mpgap@sha256:28223374b5500b09ae467064d825b44d086c99f1ade6afa80dbf8fd0053d760e" } } \ No newline at end of file diff --git a/conf/singularity.config b/conf/singularity.config index 4cf7c4c3..719e973d 100644 --- a/conf/singularity.config +++ b/conf/singularity.config @@ -5,6 +5,6 @@ singularity.enabled = true singularity.autoMounts = true process { withName: '.*' { - container = "docker://fmalmeida/mpgap@sha256:f640835dad87d98ded0582271aafaebe609f9196618f52a46ac10d991a0fce27" + container = "docker://fmalmeida/mpgap@sha256:28223374b5500b09ae467064d825b44d086c99f1ade6afa80dbf8fd0053d760e" } -} \ No newline at end of file +} diff --git a/conf/test.config b/conf/test.config index d715ab83..13392f21 100644 --- a/conf/test.config +++ b/conf/test.config @@ -9,7 +9,7 @@ profiles { params { input = "$baseDir/assets/illumina_test.yml" output = "sreads_test" - tracedir = "sreads_test/pipeline_info" + tracedir = "${params.output}/pipeline_info" max_memory = 6.GB max_cpus = 2 max_time = '6.h' @@ -30,16 +30,26 @@ profiles { ont { params { - input = 
"$baseDir/assets/lreads_test_ont.yml" - output = "lreads_test_ont" - tracedir = "lreads_test_ont/pipeline_info" + input = "$baseDir/assets/lreads_test_ont.yml" + output = "lreads_test_ont" + tracedir = "${params.output}/pipeline_info" + skip_hifiasm = true + } + } + ont_hifi { + params { + input = "$baseDir/assets/lreads_test_ont_hifi.yml" + output = "lreads_test_ont_hifi" + tracedir = "${params.output}/pipeline_info" + high_quality_longreads = true } } pacbio { params { - input = "$baseDir/assets/lreads_test_pacbio.yml" - output = "lreads_test_pacbio" - tracedir = "lreads_test_pacbio/pipeline_info" + input = "$baseDir/assets/lreads_test_pacbio.yml" + output = "lreads_test_pacbio" + tracedir = "${params.output}/pipeline_info" + skip_hifiasm = true } } @@ -60,16 +70,18 @@ profiles { ont { params { - input = "$baseDir/assets/hybrid_test_ont.yml" - output = "hybrid_test_ont" - tracedir = "hybrid_test_ont/pipeline_info" + input = "$baseDir/assets/hybrid_test_ont.yml" + output = "hybrid_test_ont" + tracedir = "${params.output}/pipeline_info" + skip_hifiasm = true } } pacbio { params { - input = "$baseDir/assets/hybrid_test_pacbio.yml" - output = "hybrid_test_pacbio" - tracedir = "hybrid_test_pacbio/pipeline_info" + input = "$baseDir/assets/hybrid_test_pacbio.yml" + output = "hybrid_test_pacbio" + tracedir = "${params.output}/pipeline_info" + skip_hifiasm = true } } diff --git a/docs/index.md b/docs/index.md index b9dc093c..d1650b8f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -23,7 +23,7 @@ The pipeline wraps up the following tools and analyses: | Software | Analysis | | :------- | :------- | -| [Canu](https://github.com/marbl/canu), [Flye](https://github.com/fenderglass/Flye), [Unicycler](https://github.com/rrwick/Unicycler), [Raven](https://github.com/lbcb-sci/raven), [Shasta](https://github.com/chanzuckerberg/shasta) and [wtdbg2](https://github.com/ruanjue/wtdbg2) | Long reads assembly | +| [Hifiasm](https://github.com/chhylp123/hifiasm), 
[Canu](https://github.com/marbl/canu), [Flye](https://github.com/fenderglass/Flye), [Unicycler](https://github.com/rrwick/Unicycler), [Raven](https://github.com/lbcb-sci/raven), [Shasta](https://github.com/chanzuckerberg/shasta) and [wtdbg2](https://github.com/ruanjue/wtdbg2) | Long reads assembly | | [Haslr](https://github.com/vpc-ccg/haslr), [Unicycler](https://github.com/rrwick/Unicycler) and [SPAdes](https://github.com/ablab/spades) | Hybrid assembly | | [Shovill](https://github.com/tseemann/shovill), [Unicycler](https://github.com/rrwick/Unicycler), [Megahit](https://github.com/voutcn/megahit) and [SPAdes](https://github.com/ablab/spades) | Short reads assembly | | [Nanopolish](https://github.com/jts/nanopolish), [Medaka](https://github.com/nanoporetech/medaka), [gcpp](https://github.com/PacificBiosciences/gcpp), [Polypolish](https://github.com/rrwick/Polypolish) and [Pilon](https://github.com/broadinstitute/pilon) | Assembly polishing | diff --git a/docs/manual.md b/docs/manual.md index 9480e609..b8112df2 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -42,6 +42,7 @@ The pipeline is capable of assembling Illumina, ONT and Pacbio reads in three ma + Raven + Shasta + wtdbg2 + + hifiasm 3. **Hybrid assemblies (using both short and long reads)** + Unicycler @@ -162,6 +163,8 @@ However, they can also be set in a sample-specific manner. If a sample has a val | `--shasta_additional_parameters` | :material-close: | False | Passes additional parameters for Raven assembler. E.g. `" --Assembly.detangleMethod 1 "`. Must be given as shown in Shasta's manual | | `--skip_wtdbg2` | :material-close: | False | Skip the execution of wtdbg2 | | `--wtdbg2_additional_parameters` | :material-close: | False | Passes additional parameters for wtdbg2 assembler. E.g. `" -k 250 "`. Must be given as shown in wtdbg2's manual. 
Remember, the script called for wtdbg2 is `wtdbg2.pl` thus you must give the parameters used by it | +| `--skip_hifiasm` | :material-close: | False | Skip the execution of hifiasm | +| `--hifiasm_additional_parameters` | :material-close: | False | Passes additional parameters for hifiasm assembler. E.g. `" --ul ul.fq.gz "`. Must be given as shown in hifiasm's manual | | `--skip_unicycler` | :material-close: | False | Skip the execution of Unicycler | | `--unicycler_additional_parameters` | :material-close: | False | Passes additional parameters for Unicycler assembler. E.g. `" --mode conservative --no_correct "`. Must be given as shown in Unicycler's manual | | `--skip_spades` | :material-close: | False | Skip the execution of SPAdes | diff --git a/docs/non_bacteria.md b/docs/non_bacteria.md index cf208fe5..138303b4 100644 --- a/docs/non_bacteria.md +++ b/docs/non_bacteria.md @@ -38,6 +38,7 @@ nextflow run fmalmeida/mpgap \ --skip_unicycler \ --flye_additional_parameters ' --keep-haplotypes ' \ --quast_additional_parameters ' --eukaryote ' \ + --skip_hifiasm \ --max_cpus 20 \ --max_memory '40.GB' ``` diff --git a/environment.yml b/environment.yml index 84db87c0..be0289cc 100644 --- a/environment.yml +++ b/environment.yml @@ -55,7 +55,7 @@ dependencies: - bioconda::csvtk=0.23.0 - bioconda::wtdbg=2.5 - bioconda::medaka=1.11.1 - +- bioconda::hifiasm=0.19.8 # for medaka > 1.4 - bioconda::samtools>=1.11 - bioconda::tabix>=1.11 diff --git a/markdown/CHANGELOG.md b/markdown/CHANGELOG.md index 23834d61..bfd67cef 100644 --- a/markdown/CHANGELOG.md +++ b/markdown/CHANGELOG.md @@ -10,11 +10,12 @@ The tracking for changes started in v2. * Increase default `--max_memory` value to 20.GB. * Add a directory called `final_assemblies` in the main output directory holding all the assemblies generated in the pipeline execution. * Updated documentation as discussed in [[#58](https://github.com/fmalmeida/MpGAP/issues/58)] and [[#57](https://github.com/fmalmeida/MpGAP/issues/57)]. 
-* [[#61](https://github.com/fmalmeida/MpGAP/issues/61)] - Add a simple parameter to adjust how many cpus and how much memory should the assembly jobs request in the first attempt to avoid lack of resources errors. * [[#50](https://github.com/fmalmeida/MpGAP/issues/50)] * Parameters `--skip_pilon` and `--skip_polypolish` added to the pipeline * MultiQC report was fixed and enhanced * Docker image was also modified to download BUSCO standalone and pipeline perform the BUSCO standalone run instead of via quast. +* [[#53](https://github.com/fmalmeida/MpGAP/issues/53)] - Include hifiasm assembler in the pipeline. Long reads only and hybrid strategy 2. +* [[#61](https://github.com/fmalmeida/MpGAP/issues/61)] - Add a simple parameter to adjust how many cpus and how much memory should the assembly jobs request in the first attempt to avoid lack of resources errors. * [[#66](https://github.com/fmalmeida/MpGAP/issues/66)] - Include an automated generation of a samplesheet for bacannot pipeline. ## v3.1.4 -- [2022-Sep-03] diff --git a/markdown/list_of_tools.md b/markdown/list_of_tools.md index bbe94e46..b8c6d30c 100644 --- a/markdown/list_of_tools.md +++ b/markdown/list_of_tools.md @@ -4,6 +4,6 @@ These are the tools that wrapped inside mpgap. 
**Cite** the tools whenever you u || **Source** | |:- | :- | -| **Assemblers** | [Canu](https://github.com/marbl/canu), [Flye](https://github.com/fenderglass/Flye), [Raven](https://github.com/lbcb-sci/raven), [Shasta](https://github.com/chanzuckerberg/shasta), [wtdbg2](https://github.com/ruanjue/wtdbg2), [Haslr](https://github.com/vpc-ccg/haslr), [Unicycler](https://github.com/rrwick/Unicycler), [Spades](https://github.com/ablab/spades), [Shovill](https://github.com/tseemann/shovill) | +| **Assemblers** | [Hifiasm](https://github.com/chhylp123/hifiasm), [Canu](https://github.com/marbl/canu), [Flye](https://github.com/fenderglass/Flye), [Raven](https://github.com/lbcb-sci/raven), [Shasta](https://github.com/chanzuckerberg/shasta), [wtdbg2](https://github.com/ruanjue/wtdbg2), [Haslr](https://github.com/vpc-ccg/haslr), [Unicycler](https://github.com/rrwick/Unicycler), [Spades](https://github.com/ablab/spades), [Shovill](https://github.com/tseemann/shovill) | | **Polishers** | [Nanopolish](https://github.com/jts/nanopolish), [Medaka](https://github.com/nanoporetech/medaka), [gcpp](https://github.com/PacificBiosciences/gcpp), [Polypolish](https://github.com/rrwick/Polypolish) and [Pilon](https://github.com/broadinstitute/pilon) | | **Quality check** | [Quast](https://github.com/ablab/quast), [BUSCO](https://busco.ezlab.org/busco_userguide.html) and [MultiQC](https://multiqc.info/) | diff --git a/modules/local/LongReads/hifiasm.nf b/modules/local/LongReads/hifiasm.nf new file mode 100644 index 00000000..42cc8fa9 --- /dev/null +++ b/modules/local/LongReads/hifiasm.nf @@ -0,0 +1,29 @@ +process hifiasm { + publishDir "${params.output}/${prefix}/hifiasm", mode: 'copy' + tag "${id}" + label 'process_assembly' + + input: + tuple val(id), val(entrypoint), file(sread1), file(sread2), file(single), file(lreads), val(lr_type), val(wtdbg2_technology), val(genome_size), val(corrected_longreads), val(high_quality_longreads), val(medaka_model), file(fast5), 
val(nanopolish_max_haplotypes), val(shasta_config), file(bams), val(prefix) + + output: + file "hifiasm*" // Saves all files + tuple val(id), file("hifiasm_assembly.fasta"), val('hifiasm') // Gets contigs file + + when: + (entrypoint == 'longreads_only' || entrypoint == 'hybrid_strategy_2') + + script: + def additional_params = (params.hifiasm_additional_parameters) ? params.hifiasm_additional_parameters : "" + """ + # run hifiasm + hifiasm \\ + -o hifiasm \\ + -t $task.cpus \\ + $additional_params \\ + $lreads + + # convert to fasta + awk '/^S/{print ">"\$2"\\n"\$3}' hifiasm.bp.p_ctg.gfa > hifiasm_assembly.fasta + """ +} diff --git a/nextflow_schema.json b/nextflow_schema.json index 5cd36a8d..dcde1f7a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -243,6 +243,13 @@ "hidden": true, "fa_icon": "fas fa-ban" }, + "skip_hifiasm": { + "type": "boolean", + "description": "Skip Hifiasm assembler", + "help_text": "Hifiasm is a long reads only assembler. Can be used for hybrid assemblies in strategy 2.", + "hidden": true, + "fa_icon": "fas fa-ban" + }, "skip_pilon": { "type": "boolean", "description": "Skip pilon polisher", @@ -340,6 +347,13 @@ "help_text": "Must be given as shown in shasta manual. E.g. \" --Reads.minReadLength 5000 \", inside quotes and separated by spaces", "hidden": true, "fa_icon": "fas fa-quote-left" + }, + "hifiasm_additional_parameters": { + "type": "string", + "description": "Hifiasm additional parameters", + "help_text": "Must be given as shown in hifiasm manual. E.g. 
\" --ul ul.fq.gz \", inside quotes and separated by spaces", + "hidden": true, + "fa_icon": "fas fa-quote-left" } }, "fa_icon": "fas fa-list-ul" @@ -463,4 +477,4 @@ "$ref": "#/definitions/institutional_config_options" } ] -} \ No newline at end of file +} diff --git a/workflows/hybrid.nf b/workflows/hybrid.nf index 5cd6b653..cda4b185 100644 --- a/workflows/hybrid.nf +++ b/workflows/hybrid.nf @@ -24,6 +24,9 @@ include { wtdbg2 as strategy_2_wtdbg2 } from '../modules/local/LongReads/wtdbg2. // Shasta assembler include { shasta as strategy_2_shasta } from '../modules/local/LongReads/shasta.nf' +// Hifiasm assembler +include { hifiasm as strategy_2_hifiasm } from '../modules/local/LongReads/hifiasm.nf' + /* * Modules for long reads assemblies polishment */ @@ -75,6 +78,7 @@ workflow HYBRID { LONGREADS_OUTPUTS['MEDAKA'] = Channel.empty() LONGREADS_OUTPUTS['NANOPOLISH'] = Channel.empty() LONGREADS_OUTPUTS['GCPP'] = Channel.empty() + LONGREADS_OUTPUTS['HIFIASM'] = Channel.empty() def HYBRID_OUTPUTS = [:] HYBRID_OUTPUTS['UNICYCLER'] = Channel.empty() @@ -173,6 +177,14 @@ workflow HYBRID { ch_versions_hb = ch_versions_hb.mix(strategy_2_shasta.out.versions.first()) } + /* + *Hifiasm + */ + if (!params.skip_hifiasm) { + strategy_2_hifiasm(input_branches.secondary) + LONGREADS_OUTPUTS['HIFIASM'] = strategy_2_hifiasm.out[1] + } + /* * wtdbg2 */ @@ -182,6 +194,8 @@ workflow HYBRID { ch_versions_hb = ch_versions_hb.mix(strategy_2_wtdbg2.out.versions.first()) } + + // Get long reads assemblies LONGREADS_OUTPUTS['RAW_ASSEMBLIES'] = LONGREADS_OUTPUTS['CANU'] @@ -190,7 +204,8 @@ workflow HYBRID { LONGREADS_OUTPUTS['UNICYCLER'], LONGREADS_OUTPUTS['RAVEN'], LONGREADS_OUTPUTS['WTDBG2'], - LONGREADS_OUTPUTS['SHASTA'] + LONGREADS_OUTPUTS['SHASTA'], + LONGREADS_OUTPUTS['HIFIASM'] ) .combine(input_tuple, by: 0) diff --git a/workflows/long-reads-only.nf b/workflows/long-reads-only.nf index 90716634..09f68758 100644 --- a/workflows/long-reads-only.nf +++ b/workflows/long-reads-only.nf @@ 
-24,6 +24,9 @@ include { wtdbg2 } from '../modules/local/LongReads/wtdbg2.nf' // Shasta assembler include { shasta } from '../modules/local/LongReads/shasta.nf' +// Hifiasm assembler +include { hifiasm } from '../modules/local/LongReads/hifiasm.nf' + /* * Modules for long reads assemblies polishment */ @@ -56,6 +59,7 @@ workflow LONGREADS_ONLY { LONGREADS_OUTPUTS['MEDAKA'] = Channel.empty() LONGREADS_OUTPUTS['NANOPOLISH'] = Channel.empty() LONGREADS_OUTPUTS['GCPP'] = Channel.empty() + LONGREADS_OUTPUTS['HIFIASM'] = Channel.empty() ch_versions_lr = Channel.empty() @@ -113,13 +117,22 @@ workflow LONGREADS_ONLY { ch_versions_lr = ch_versions_lr.mix(wtdbg2.out.versions.first()) } + /* + *Hifiasm + */ + if (!params.skip_hifiasm) { + hifiasm(input_tuple) + LONGREADS_OUTPUTS['HIFIASM'] = hifiasm.out[1] + } + // gather assemblies for polishing steps LONGREADS_OUTPUTS['RAW_ASSEMBLIES'] = LONGREADS_OUTPUTS['CANU'] .mix(LONGREADS_OUTPUTS['UNICYCLER'], LONGREADS_OUTPUTS['FLYE'] , LONGREADS_OUTPUTS['RAVEN'], LONGREADS_OUTPUTS['WTDBG2'], - LONGREADS_OUTPUTS['SHASTA']) + LONGREADS_OUTPUTS['SHASTA'], + LONGREADS_OUTPUTS['HIFIASM']) .combine(input_tuple, by: 0) /*