diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 61a7436..8f63d23 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -65,7 +65,7 @@ docker-run: - if: $MATRIX_NAME == "minimap2" variables: NF_PROCESS_FILES: "wf-metagenomics/subworkflows/minimap_pipeline.nf" - NF_WORKFLOW_OPTS: "--fastq test_data/case01 --minimap2_by_reference" + NF_WORKFLOW_OPTS: "--fastq test_data/case01 --minimap2_by_reference --keep_bam" NF_IGNORE_PROCESSES: "extractMinimap2Reads" - if: $MATRIX_NAME == "minimap2-sample-sheet" variables: diff --git a/CHANGELOG.md b/CHANGELOG.md index d53debc..4a0d39a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v1.2.0] +### Added +- Output IGV configuration file if the `keep_bam` option is enabled and a custom reference is provided (in minimap2 mode). +- Output reduced reference file if the `keep_bam` option is enabled (in minimap2 mode). +- `abundance_threshold` reduces the number of references to be displayed in IGV. +### Fixed +- `exclude-host` can input a file in the EPI2ME Desktop Application. +### Changed +- Bump to wf-metagenomics v2.10.0 + ## [v1.1.3] ### Added - Reads below percentages of identity (`min_percent_identity`) and the reference covered (`min_ref_coverage`) are considered as unclassified in the minimap2 approach. diff --git a/README.md b/README.md index 1856e1e..18fe1a8 100644 --- a/README.md +++ b/README.md @@ -177,7 +177,7 @@ input_reads.fastq ─── input_directory ─── input_directory | minimap2filter | string | Filter output of minimap2 by taxids inc. child nodes, E.g. "9606,1404" | Provide a list of taxids if you are only interested in certain ones in your minimap2 analysis outputs. | | | minimap2exclude | boolean | Invert minimap2filter and exclude the given taxids instead | Exclude a list of taxids from analysis outputs. | False | | split_prefix | boolean | Enable if using a very large reference with minimap2 | If reference fasta large enough to require multipart index, set to true to use split-prefix option with minimap2. | False | -| keep_bam | boolean | Copy bam files into the output directory. | | False | +| keep_bam | boolean | Copy bam files into the output directory. It also creates the configuration and reduced reference files needed to load the alignments in IGV. | | False | | minimap2_by_reference | boolean | Add a table with the mean sequencing depth per reference, standard deviation and coefficient of variation. It adds a scatterplot of the sequencing depth vs. the coverage and a heatmap showing the depth per percentile to the report | | False | | min_percent_identity | number | Minimum percentage of identity with the matched reference to define a sequence as classified; sequences with a value lower than this are defined as unclassified. | | 95 | | min_ref_coverage | number | Minimum coverage value to define a sequence as classified; sequences with a coverage value lower than this are defined as unclassified. Use this option if you expect reads whose lengths are similar to the references' lengths. | | 90 | @@ -187,7 +187,7 @@ input_reads.fastq ─── input_directory ─── input_directory | Nextflow parameter name | Type | Description | Help | Default | |--------------------------|------|-------------|------|---------| -| abundance_threshold | number | Remove those taxa whose abundance is equal or lower than the chosen value. | To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads), use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger than 1. | 1 | +| abundance_threshold | number | Remove those taxa whose abundance is equal or lower than the chosen value. | To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads) use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger or equal to 1. | 1 | | n_taxa_barplot | integer | Number of most abundance taxa to be displayed in the barplot. The rest of taxa will be grouped under the "Other" category. | | 9 | @@ -205,7 +205,7 @@ input_reads.fastq ─── input_directory ─── input_directory | min_len | integer | Specify read length lower limit. | Any reads shorter than this limit will not be included in the analysis. | 800 | | min_read_qual | number | Specify read quality lower limit. | Any reads with a quality lower than this limit will not be included in the analysis. | | | max_len | integer | Specify read length upper limit | Any reads longer than this limit will not be included in the analysis. | 2000 | -| threads | integer | Maximum number of CPU threads to use per workflow task. | Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. The total CPU resource used by the workflow is constrained by the executor configuration. See server threads parameter for Kraken specific threads in the real_time pipeline. | 4 | +| threads | integer | Maximum number of CPU threads to use in each parallel workflow task. | Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. See server threads parameter for Kraken specific threads in the real_time pipeline. | 4 | @@ -229,6 +229,10 @@ Output files may be aggregated including information for all samples or provided | BAM index file (minimap2) | ./bams/{{ alias }}.reference.bam.bai | Index file generated from mapping filtered input reads to the reference. | per-sample | | BAM flagstat (minimap2) | ./bams/{{ alias }}.bamstats_results/bamstats.flagstat.tsv | Mapping results per reference | per-sample | | Minimap2 alignment statistics (minimap2) | ./bams/{{ alias }}.bamstats_results/bamstats.readstats.tsv.gz | Per read stats after aligning | per-sample | +| Reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz | Reference FASTA file containing only those sequences that have reads mapped against them. | aggregated | +| Index of the reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz.fai | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated | +| GZI index of the reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz.gzi | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated | +| JSON configuration file for IGV browser | ./igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reduced reference. | aggregated | @@ -263,6 +267,8 @@ nextflow run epi2me-labs/wf-16s --fastq test_data/case01 --classifier minimap2 The creation of alignment statistics plots can be enabled with the `minimap2_by_reference` flag. Using this option produces a table and scatter plot in the report showing sequencing depth and coverage of each reference. The report also contains a heatmap indicating the sequencing depth over relative genomic coordinates for the references with the highest coverage (references with a mean coverage of less than 1% of the one with the largest value are omitted). +In addition, the user can output BAM files in a folder called `bams` by using the option `keep_bam`. If the user provides a custom database, the workflow will also output the references with reads mappings, as well as an IGV configuration file. This configuration file allows the user to view the alignments in the EPI2ME Desktop Application in the Viewer tab. Note that the number of references can be reduced using the `abundance_threshold` option, which will select those references with a number of reads aligned higher than this value. Please consider that the view of the alignment is highly dependent on the reference selected. + #### 3.2 Using Kraken2 [Kraken2](https://github.com/DerrickWood/kraken2) provides the fastest method for the taxonomic classification of the reads. Then, [Bracken](https://github.com/jenniferlu717/Bracken) is used to provide an estimate of the species (or the selected taxonomic rank) abundance in the sample. @@ -310,7 +316,7 @@ There are three types of biodiversity measures described over a special scale [3](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4224527/), which can be chosen by the user with the `taxonomic_rank` parameter ('D'=Domain,'P'=Phylum, 'C'=Class, 'O'=Order, 'F'=Family, 'G'=Genus, 'S'=Species). By default, the rank is 'S' (species-level). Some of the included alpha diversity metrics are: +To provide a quick overview of the alpha-diversity of the microbial community, we provide some of the most common diversity metrics calculated for a specific taxonomic rank [3](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4224527/), which can be chosen by the user with the `taxonomic_rank` parameter ('D'=Domain,'P'=Phylum, 'C'=Class, 'O'=Order, 'F'=Family, 'G'=Genus, 'S'=Species). By default, the rank is 'G' (genus-level). Some of the included alpha diversity metrics are: * Shannon Diversity Index (H): Shannon entropy approaches zero if a community is almost entirely made up of a single taxon. diff --git a/docs/06_input_parameters.md b/docs/06_input_parameters.md index 109586e..b4f081e 100644 --- a/docs/06_input_parameters.md +++ b/docs/06_input_parameters.md @@ -60,7 +60,7 @@ | minimap2filter | string | Filter output of minimap2 by taxids inc. child nodes, E.g. "9606,1404" | Provide a list of taxids if you are only interested in certain ones in your minimap2 analysis outputs. | | | minimap2exclude | boolean | Invert minimap2filter and exclude the given taxids instead | Exclude a list of taxids from analysis outputs. | False | | split_prefix | boolean | Enable if using a very large reference with minimap2 | If reference fasta large enough to require multipart index, set to true to use split-prefix option with minimap2. | False | -| keep_bam | boolean | Copy bam files into the output directory. | | False | +| keep_bam | boolean | Copy bam files into the output directory. It also creates the configuration and reduced reference files needed to load the alignments in IGV. | | False | | minimap2_by_reference | boolean | Add a table with the mean sequencing depth per reference, standard deviation and coefficient of variation. It adds a scatterplot of the sequencing depth vs. the coverage and a heatmap showing the depth per percentile to the report | | False | | min_percent_identity | number | Minimum percentage of identity with the matched reference to define a sequence as classified; sequences with a value lower than this are defined as unclassified. | | 95 | | min_ref_coverage | number | Minimum coverage value to define a sequence as classified; sequences with a coverage value lower than this are defined as unclassified. Use this option if you expect reads whose lengths are similar to the references' lengths. | | 90 | @@ -70,7 +70,7 @@ | Nextflow parameter name | Type | Description | Help | Default | |--------------------------|------|-------------|------|---------| -| abundance_threshold | number | Remove those taxa whose abundance is equal or lower than the chosen value. | To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads), use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger than 1. | 1 | +| abundance_threshold | number | Remove those taxa whose abundance is equal or lower than the chosen value. | To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads) use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger or equal to 1. | 1 | | n_taxa_barplot | integer | Number of most abundance taxa to be displayed in the barplot. The rest of taxa will be grouped under the "Other" category. | | 9 | @@ -88,6 +88,6 @@ | min_len | integer | Specify read length lower limit. | Any reads shorter than this limit will not be included in the analysis. | 800 | | min_read_qual | number | Specify read quality lower limit. | Any reads with a quality lower than this limit will not be included in the analysis. | | | max_len | integer | Specify read length upper limit | Any reads longer than this limit will not be included in the analysis. | 2000 | -| threads | integer | Maximum number of CPU threads to use per workflow task. | Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. The total CPU resource used by the workflow is constrained by the executor configuration. See server threads parameter for Kraken specific threads in the real_time pipeline. | 4 | +| threads | integer | Maximum number of CPU threads to use in each parallel workflow task. | Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. See server threads parameter for Kraken specific threads in the real_time pipeline. | 4 | diff --git a/docs/07_outputs.md b/docs/07_outputs.md index c0ab96c..a77f028 100644 --- a/docs/07_outputs.md +++ b/docs/07_outputs.md @@ -13,3 +13,7 @@ Output files may be aggregated including information for all samples or provided | BAM index file (minimap2) | ./bams/{{ alias }}.reference.bam.bai | Index file generated from mapping filtered input reads to the reference. | per-sample | | BAM flagstat (minimap2) | ./bams/{{ alias }}.bamstats_results/bamstats.flagstat.tsv | Mapping results per reference | per-sample | | Minimap2 alignment statistics (minimap2) | ./bams/{{ alias }}.bamstats_results/bamstats.readstats.tsv.gz | Per read stats after aligning | per-sample | +| Reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz | Reference FASTA file containing only those sequences that have reads mapped against them. | aggregated | +| Index of the reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz.fai | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated | +| GZI index of the reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz.gzi | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated | +| JSON configuration file for IGV browser | ./igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reduced reference. | aggregated | diff --git a/docs/08_pipeline_overview.md b/docs/08_pipeline_overview.md index fd266f5..7d185ea 100644 --- a/docs/08_pipeline_overview.md +++ b/docs/08_pipeline_overview.md @@ -26,6 +26,8 @@ nextflow run epi2me-labs/wf-16s --fastq test_data/case01 --classifier minimap2 The creation of alignment statistics plots can be enabled with the `minimap2_by_reference` flag. Using this option produces a table and scatter plot in the report showing sequencing depth and coverage of each reference. The report also contains a heatmap indicating the sequencing depth over relative genomic coordinates for the references with the highest coverage (references with a mean coverage of less than 1% of the one with the largest value are omitted). +In addition, the user can output BAM files in a folder called `bams` by using the option `keep_bam`. If the user provides a custom database, the workflow will also output the references with reads mappings, as well as an IGV configuration file. This configuration file allows the user to view the alignments in the EPI2ME Desktop Application in the Viewer tab. Note that the number of references can be reduced using the `abundance_threshold` option, which will select those references with a number of reads aligned higher than this value. Please consider that the view of the alignment is highly dependent on the reference selected. + #### 3.2 Using Kraken2 [Kraken2](https://github.com/DerrickWood/kraken2) provides the fastest method for the taxonomic classification of the reads. Then, [Bracken](https://github.com/jenniferlu717/Bracken) is used to provide an estimate of the species (or the selected taxonomic rank) abundance in the sample. @@ -73,7 +75,7 @@ There are three types of biodiversity measures described over a special scale [3](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4224527/), which can be chosen by the user with the `taxonomic_rank` parameter ('D'=Domain,'P'=Phylum, 'C'=Class, 'O'=Order, 'F'=Family, 'G'=Genus, 'S'=Species). By default, the rank is 'S' (species-level). Some of the included alpha diversity metrics are: +To provide a quick overview of the alpha-diversity of the microbial community, we provide some of the most common diversity metrics calculated for a specific taxonomic rank [3](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4224527/), which can be chosen by the user with the `taxonomic_rank` parameter ('D'=Domain,'P'=Phylum, 'C'=Class, 'O'=Order, 'F'=Family, 'G'=Genus, 'S'=Species). By default, the rank is 'G' (genus-level). Some of the included alpha diversity metrics are: * Shannon Diversity Index (H): Shannon entropy approaches zero if a community is almost entirely made up of a single taxon. diff --git a/main.nf b/main.nf index 775e092..114e6af 100644 --- a/main.nf +++ b/main.nf @@ -82,15 +82,18 @@ workflow { } // Input data + // real time wf still requires per-read-stats as it computes its own aggregated statistics + // TODO: investigate chunk option for real time and use of histograms if (params.fastq) { samples = fastq_ingress([ "input":params.fastq, "sample": params.real_time ? null : params.sample, "sample_sheet": params.real_time ? null : params.sample_sheet, "analyse_unclassified":params.analyse_unclassified, - "stats": params.wf.stats, + "stats": true, "fastcat_extra_args": fastcat_extra_args.join(" "), - "watch_path": params.real_time + "watch_path": params.real_time, + "per_read_stats": params.real_time ? true : false ]) } else { // if we didn't get a `--fastq`, there must have been a `--bam` (as is codified @@ -101,9 +104,10 @@ workflow { "sample_sheet":params.sample_sheet, "analyse_unclassified":params.analyse_unclassified, "return_fastq": true, - "keep_unaligned": params.wf.keep_unaligned, - "stats": params.wf.stats, - "watch_path": params.real_time + "keep_unaligned": true, + "stats": true, + "watch_path": params.real_time, + "per_read_stats": params.real_time ? true : false ]) } diff --git a/nextflow.config b/nextflow.config index 119a931..0d86cd6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -96,15 +96,13 @@ params { schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf,amr,amr_db,amr_minid,amr_mincov,database_sets' wf { - stats = true - keep_unaligned = true example_cmd = [ "--fastq 'wf-16s-demo/test_data'", "--minimap2_by_reference" ] agent = null container_sha = "sha44a6dacff5f2001d917b774647bb4cbc1b53bc76" - common_sha = "sha645176f98b8780851f9c476a064d44c2ae76ddf6" + common_sha = "sha338caea0a2532dc0ea8f46638ccc322bb8f9af48" } } @@ -116,7 +114,7 @@ manifest { description = 'Identification of the origin of single reads from 16S/ITS amplicon sequencing.' mainScript = 'main.nf' nextflowVersion = '>=23.04.2' - version = 'v1.1.3' + version = 'v1.2.0' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 5db138f..e283a5f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -61,7 +61,7 @@ }, "exclude_host": { "type": "string", - "format": "path", + "format": "file-path", "title": "Exclude host reads", "description": "A FASTA or MMI file of the host reference. Reads that align with this reference will be excluded from the analysis." } @@ -305,7 +305,7 @@ "type": "boolean", "title": "Enable keep BAM files", "default": false, - "description": "Copy bam files into the output directory." + "description": "Copy bam files into the output directory. It also creates the configuration and reduced reference files needed to load the alignments in IGV." }, "minimap2_by_reference": { "type": "boolean", @@ -343,7 +343,7 @@ "default": 1, "title": "Abundance threshold", "description": "Remove those taxa whose abundance is equal or lower than the chosen value.", - "help_text": "To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads), use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger than 1." + "help_text": "To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads) use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger or equal to 1." }, "n_taxa_barplot": { "type": "integer", @@ -397,9 +397,9 @@ "threads": { "type": "integer", "default": 4, - "title": "Maximum number of CPU threads", - "description": "Maximum number of CPU threads to use per workflow task.", - "help_text": "Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. The total CPU resource used by the workflow is constrained by the executor configuration. See server threads parameter for Kraken specific threads in the real_time pipeline." + "title": "Number of CPU threads per workflow task", + "description": "Maximum number of CPU threads to use in each parallel workflow task.", + "help_text": "Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. See server threads parameter for Kraken specific threads in the real_time pipeline." } } }, diff --git a/output_definition.json b/output_definition.json index 42609e3..b4c0e9d 100644 --- a/output_definition.json +++ b/output_definition.json @@ -87,6 +87,38 @@ "mime-type": "application/gzip", "optional": true, "type": "per-sample" + }, + "reduced-reference": { + "filepath": "./igv_reference/reduced_reference.fasta.gz", + "title": "Reduced reference FASTA file", + "description": "Reference FASTA file containing only those sequences that have reads mapped against them.", + "mime-type": "application/gzip", + "optional": true, + "type": "aggregated" + }, + "reduced-reference-index": { + "filepath": "./igv_reference/reduced_reference.fasta.gz.fai", + "title": "Index of the reduced reference FASTA file", + "description": "Index of the reference FASTA file containing only those sequences that have reads mapped against them.", + "mime-type": "text/tab-separated-values", + "optional": true, + "type": "aggregated" + }, + "reduced-reference-gzi-index": { + "filepath": "./igv_reference/reduced_reference.fasta.gz.gzi", + "title": "GZI index of the reduced reference FASTA file", + "description": "Index of the reference FASTA file containing only those sequences that have reads mapped against them.", + "mime-type": "application/octet-stream", + "optional": true, + "type": "aggregated" + }, + "igv-config": { + "filepath": "./igv.json", + "title": "JSON configuration file for IGV browser", + "description": "JSON configuration file to be loaded in IGV for visualising alignments against the reduced reference.", + "mime-type": "text/json", + "optional": true, + "type": "aggregated" } } } \ No newline at end of file diff --git a/wf-metagenomics b/wf-metagenomics index b12d893..671dff2 160000 --- a/wf-metagenomics +++ b/wf-metagenomics @@ -1 +1 @@ -Subproject commit b12d8938568e4906ddc7a425f162a91a3d1e3577 +Subproject commit 671dff23425e1f76da3ce3337a9685b48657dcbb