Merge branch 'bump-120' into 'dev'

Bump 120 See merge request epi2melabs/workflows/wf-16s!14
epi2me-labs · Jun 5, 2024 · c8e0b7f · c8e0b7f
2 parents 4a8184a + 0d3b5d6
commit c8e0b7f
Show file tree

Hide file tree

Showing 11 changed files with 81 additions and 25 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -65,7 +65,7 @@ docker-run:
      - if: $MATRIX_NAME == "minimap2"
        variables:
           NF_PROCESS_FILES: "wf-metagenomics/subworkflows/minimap_pipeline.nf"
-          NF_WORKFLOW_OPTS: "--fastq test_data/case01 --minimap2_by_reference"
+          NF_WORKFLOW_OPTS: "--fastq test_data/case01 --minimap2_by_reference --keep_bam"
           NF_IGNORE_PROCESSES: "extractMinimap2Reads"
      - if: $MATRIX_NAME == "minimap2-sample-sheet"
        variables:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 
+## [v1.2.0]
+### Added
+- Output IGV configuration file if the `keep_bam` option is enabled and a custom reference is provided (in minimap2 mode).
+- Output reduced reference file if the `keep_bam` option is enabled (in minimap2 mode).
+- `abundance_threshold` reduces the number of references to be displayed in IGV.
+### Fixed
+- `exclude-host` can input a file in the EPI2ME Desktop Application.
+### Changed
+- Bump to wf-metagenomics v2.10.0
+
 ## [v1.1.3]
 ### Added
 - Reads below percentages of identity (`min_percent_identity`) and the reference covered (`min_ref_coverage`) are considered as unclassified in the minimap2 approach.

diff --git a/README.md b/README.md
@@ -177,7 +177,7 @@ input_reads.fastq   ─── input_directory  ─── input_directory
 | minimap2filter | string | Filter output of minimap2 by taxids inc. child nodes, E.g. "9606,1404" | Provide a list of taxids if you are only interested in certain ones in your minimap2 analysis outputs. |  |
 | minimap2exclude | boolean | Invert minimap2filter and exclude the given taxids instead | Exclude a list of taxids from analysis outputs. | False |
 | split_prefix | boolean | Enable if using a very large reference with minimap2 | If reference fasta large enough to require multipart index, set to true to use split-prefix option with minimap2.  | False |
-| keep_bam | boolean | Copy bam files into the output directory. |  | False |
+| keep_bam | boolean | Copy bam files into the output directory. It also creates the configuration and reduced reference files needed to load the alignments in IGV. |  | False |
 | minimap2_by_reference | boolean | Add a table with the mean sequencing depth per reference, standard deviation and coefficient of variation. It adds a scatterplot of the sequencing depth vs. the coverage and a heatmap showing the depth per percentile to the report |  | False |
 | min_percent_identity | number | Minimum percentage of identity with the matched reference to define a sequence as classified; sequences with a value lower than this are defined as unclassified. |  | 95 |
 | min_ref_coverage | number | Minimum coverage value to define a sequence as classified; sequences with a coverage value lower than this are defined as unclassified. Use this option if you expect reads whose lengths are similar to the references' lengths. |  | 90 |
@@ -187,7 +187,7 @@ input_reads.fastq   ─── input_directory  ─── input_directory
 
 | Nextflow parameter name  | Type | Description | Help | Default |
 |--------------------------|------|-------------|------|---------|
-| abundance_threshold | number | Remove those taxa whose abundance is equal or lower than the chosen value. | To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads), use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger than 1. | 1 |
+| abundance_threshold | number | Remove those taxa whose abundance is equal or lower than the chosen value. | To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads) use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger or equal to 1. | 1 |
 | n_taxa_barplot | integer | Number of most abundance taxa to be displayed in the barplot. The rest of taxa will be grouped under the "Other" category. |  | 9 |
 
 
@@ -205,7 +205,7 @@ input_reads.fastq   ─── input_directory  ─── input_directory
 | min_len | integer | Specify read length lower limit. | Any reads shorter than this limit will not be included in the analysis. | 800 |
 | min_read_qual | number | Specify read quality lower limit. | Any reads with a quality lower than this limit will not be included in the analysis. |  |
 | max_len | integer | Specify read length upper limit | Any reads longer than this limit will not be included in the analysis. | 2000 |
-| threads | integer | Maximum number of CPU threads to use per workflow task. | Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. The total CPU resource used by the workflow is constrained by the executor configuration. See server threads parameter for Kraken specific threads in the real_time pipeline. | 4 |
+| threads | integer | Maximum number of CPU threads to use in each parallel workflow task. | Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. See server threads parameter for Kraken specific threads in the real_time pipeline. | 4 |
 
 
 
@@ -229,6 +229,10 @@ Output files may be aggregated including information for all samples or provided
 | BAM index file (minimap2) | ./bams/{{ alias }}.reference.bam.bai | Index file generated from mapping filtered input reads to the reference. | per-sample |
 | BAM flagstat (minimap2) | ./bams/{{ alias }}.bamstats_results/bamstats.flagstat.tsv | Mapping results per reference | per-sample |
 | Minimap2 alignment statistics (minimap2) | ./bams/{{ alias }}.bamstats_results/bamstats.readstats.tsv.gz | Per read stats after aligning | per-sample |
+| Reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz | Reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
+| Index of the reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz.fai | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
+| GZI index of the reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz.gzi | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
+| JSON configuration file for IGV browser | ./igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reduced reference. | aggregated |
 
 
 
@@ -263,6 +267,8 @@ nextflow run epi2me-labs/wf-16s --fastq test_data/case01 --classifier minimap2
 
 The creation of alignment statistics plots can be enabled with the `minimap2_by_reference` flag. Using this option produces a table and scatter plot in the report showing sequencing depth and coverage of each reference. The report also contains a heatmap indicating the sequencing depth over relative genomic coordinates for the references with the highest coverage (references with a mean coverage of less than 1% of the one with the largest value are omitted).
 
+In addition, the user can output BAM files in a folder called `bams` by using the option `keep_bam`. If the user provides a custom database, the workflow will also output the references with reads mappings, as well as an IGV configuration file. This configuration file allows the user to view the alignments in the EPI2ME Desktop Application in the Viewer tab. Note that the number of references can be reduced using the `abundance_threshold` option, which will select those references with a number of reads aligned higher than this value. Please consider that the view of the alignment is highly dependent on the reference selected.
+
 #### 3.2 Using Kraken2
 
 [Kraken2](https://github.com/DerrickWood/kraken2) provides the fastest method for the taxonomic classification of the reads. Then, [Bracken](https://github.com/jenniferlu717/Bracken) is used to provide an estimate of the species (or the selected taxonomic rank) abundance in the sample.
@@ -310,7 +316,7 @@ There are three types of biodiversity measures described over a special scale <s
 * Beta-diversity defined as variation in the identities of species among sites, provides a direct link between biodiversity at local scales (alpha diversity) and the broader regional species pool (gamma diversity).
 * Gamma-diversity is the total observed richness within an entire region.
 
-To provide a quick overview of the alpha-diversity of the microbial community, we provide some of the most common diversity metrics calculated for a specific taxonomic rank <sup>[3](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4224527/)</sup>, which can be chosen by the user with the `taxonomic_rank` parameter ('D'=Domain,'P'=Phylum, 'C'=Class, 'O'=Order, 'F'=Family, 'G'=Genus, 'S'=Species). By default, the rank is 'S' (species-level). Some of the included alpha diversity metrics are:
+To provide a quick overview of the alpha-diversity of the microbial community, we provide some of the most common diversity metrics calculated for a specific taxonomic rank <sup>[3](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4224527/)</sup>, which can be chosen by the user with the `taxonomic_rank` parameter ('D'=Domain,'P'=Phylum, 'C'=Class, 'O'=Order, 'F'=Family, 'G'=Genus, 'S'=Species). By default, the rank is 'G' (genus-level). Some of the included alpha diversity metrics are:
 
 * Shannon Diversity Index (H): Shannon entropy approaches zero if a community is almost entirely made up of a single taxon.
 

diff --git a/docs/06_input_parameters.md b/docs/06_input_parameters.md
@@ -60,7 +60,7 @@
 | minimap2filter | string | Filter output of minimap2 by taxids inc. child nodes, E.g. "9606,1404" | Provide a list of taxids if you are only interested in certain ones in your minimap2 analysis outputs. |  |
 | minimap2exclude | boolean | Invert minimap2filter and exclude the given taxids instead | Exclude a list of taxids from analysis outputs. | False |
 | split_prefix | boolean | Enable if using a very large reference with minimap2 | If reference fasta large enough to require multipart index, set to true to use split-prefix option with minimap2.  | False |
-| keep_bam | boolean | Copy bam files into the output directory. |  | False |
+| keep_bam | boolean | Copy bam files into the output directory. It also creates the configuration and reduced reference files needed to load the alignments in IGV. |  | False |
 | minimap2_by_reference | boolean | Add a table with the mean sequencing depth per reference, standard deviation and coefficient of variation. It adds a scatterplot of the sequencing depth vs. the coverage and a heatmap showing the depth per percentile to the report |  | False |
 | min_percent_identity | number | Minimum percentage of identity with the matched reference to define a sequence as classified; sequences with a value lower than this are defined as unclassified. |  | 95 |
 | min_ref_coverage | number | Minimum coverage value to define a sequence as classified; sequences with a coverage value lower than this are defined as unclassified. Use this option if you expect reads whose lengths are similar to the references' lengths. |  | 90 |
@@ -70,7 +70,7 @@
 
 | Nextflow parameter name  | Type | Description | Help | Default |
 |--------------------------|------|-------------|------|---------|
-| abundance_threshold | number | Remove those taxa whose abundance is equal or lower than the chosen value. | To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads), use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger than 1. | 1 |
+| abundance_threshold | number | Remove those taxa whose abundance is equal or lower than the chosen value. | To remove taxa with abundances lower than or equal to a relative value (compared to the total number of reads) use a decimal between 0-1 (1 not inclusive). To remove taxa with abundances lower than or equal to an absolute value, provide a number larger or equal to 1. | 1 |
 | n_taxa_barplot | integer | Number of most abundance taxa to be displayed in the barplot. The rest of taxa will be grouped under the "Other" category. |  | 9 |
 
 
@@ -88,6 +88,6 @@
 | min_len | integer | Specify read length lower limit. | Any reads shorter than this limit will not be included in the analysis. | 800 |
 | min_read_qual | number | Specify read quality lower limit. | Any reads with a quality lower than this limit will not be included in the analysis. |  |
 | max_len | integer | Specify read length upper limit | Any reads longer than this limit will not be included in the analysis. | 2000 |
-| threads | integer | Maximum number of CPU threads to use per workflow task. | Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. The total CPU resource used by the workflow is constrained by the executor configuration. See server threads parameter for Kraken specific threads in the real_time pipeline. | 4 |
+| threads | integer | Maximum number of CPU threads to use in each parallel workflow task. | Several tasks in this workflow benefit from using multiple CPU threads. This option sets the number of CPU threads for all such processes. See server threads parameter for Kraken specific threads in the real_time pipeline. | 4 |
 
 
diff --git a/docs/07_outputs.md b/docs/07_outputs.md
@@ -13,3 +13,7 @@ Output files may be aggregated including information for all samples or provided
 | BAM index file (minimap2) | ./bams/{{ alias }}.reference.bam.bai | Index file generated from mapping filtered input reads to the reference. | per-sample |
 | BAM flagstat (minimap2) | ./bams/{{ alias }}.bamstats_results/bamstats.flagstat.tsv | Mapping results per reference | per-sample |
 | Minimap2 alignment statistics (minimap2) | ./bams/{{ alias }}.bamstats_results/bamstats.readstats.tsv.gz | Per read stats after aligning | per-sample |
+| Reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz | Reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
+| Index of the reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz.fai | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
+| GZI index of the reduced reference FASTA file | ./igv_reference/reduced_reference.fasta.gz.gzi | Index of the reference FASTA file containing only those sequences that have reads mapped against them. | aggregated |
+| JSON configuration file for IGV browser | ./igv.json | JSON configuration file to be loaded in IGV for visualising alignments against the reduced reference. | aggregated |
diff --git a/docs/08_pipeline_overview.md b/docs/08_pipeline_overview.md
@@ -26,6 +26,8 @@ nextflow run epi2me-labs/wf-16s --fastq test_data/case01 --classifier minimap2
 
 The creation of alignment statistics plots can be enabled with the `minimap2_by_reference` flag. Using this option produces a table and scatter plot in the report showing sequencing depth and coverage of each reference. The report also contains a heatmap indicating the sequencing depth over relative genomic coordinates for the references with the highest coverage (references with a mean coverage of less than 1% of the one with the largest value are omitted).
 
+In addition, the user can output BAM files in a folder called `bams` by using the option `keep_bam`. If the user provides a custom database, the workflow will also output the references with reads mappings, as well as an IGV configuration file. This configuration file allows the user to view the alignments in the EPI2ME Desktop Application in the Viewer tab. Note that the number of references can be reduced using the `abundance_threshold` option, which will select those references with a number of reads aligned higher than this value. Please consider that the view of the alignment is highly dependent on the reference selected.
+
 #### 3.2 Using Kraken2
 
 [Kraken2](https://github.com/DerrickWood/kraken2) provides the fastest method for the taxonomic classification of the reads. Then, [Bracken](https://github.com/jenniferlu717/Bracken) is used to provide an estimate of the species (or the selected taxonomic rank) abundance in the sample.
@@ -73,7 +75,7 @@ There are three types of biodiversity measures described over a special scale <s
 * Beta-diversity defined as variation in the identities of species among sites, provides a direct link between biodiversity at local scales (alpha diversity) and the broader regional species pool (gamma diversity).
 * Gamma-diversity is the total observed richness within an entire region.
 
-To provide a quick overview of the alpha-diversity of the microbial community, we provide some of the most common diversity metrics calculated for a specific taxonomic rank <sup>[3](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4224527/)</sup>, which can be chosen by the user with the `taxonomic_rank` parameter ('D'=Domain,'P'=Phylum, 'C'=Class, 'O'=Order, 'F'=Family, 'G'=Genus, 'S'=Species). By default, the rank is 'S' (species-level). Some of the included alpha diversity metrics are:
+To provide a quick overview of the alpha-diversity of the microbial community, we provide some of the most common diversity metrics calculated for a specific taxonomic rank <sup>[3](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4224527/)</sup>, which can be chosen by the user with the `taxonomic_rank` parameter ('D'=Domain,'P'=Phylum, 'C'=Class, 'O'=Order, 'F'=Family, 'G'=Genus, 'S'=Species). By default, the rank is 'G' (genus-level). Some of the included alpha diversity metrics are:
 
 * Shannon Diversity Index (H): Shannon entropy approaches zero if a community is almost entirely made up of a single taxon.
 

diff --git a/main.nf b/main.nf
@@ -82,15 +82,18 @@ workflow {
     }
 
     // Input data
+    // real time wf still requires per-read-stats as it computes its own aggregated statistics
+    // TODO: investigate chunk option for real time and use of histograms
     if (params.fastq) {
             samples = fastq_ingress([
                 "input":params.fastq,
                 "sample": params.real_time ? null : params.sample,
                 "sample_sheet": params.real_time ? null : params.sample_sheet,
                 "analyse_unclassified":params.analyse_unclassified,
-                "stats": params.wf.stats,
+                "stats": true,
                 "fastcat_extra_args": fastcat_extra_args.join(" "),
-                "watch_path": params.real_time
+                "watch_path": params.real_time,
+                "per_read_stats": params.real_time ? true : false
             ])
     } else {
             // if we didn't get a `--fastq`, there must have been a `--bam` (as is codified
@@ -101,9 +104,10 @@ workflow {
                 "sample_sheet":params.sample_sheet,
                 "analyse_unclassified":params.analyse_unclassified,
                 "return_fastq": true,
-                "keep_unaligned": params.wf.keep_unaligned,
-                "stats": params.wf.stats,
-                "watch_path": params.real_time
+                "keep_unaligned": true,
+                "stats": true,
+                "watch_path": params.real_time,
+                "per_read_stats": params.real_time ? true : false
             ])
     }
 

diff --git a/nextflow.config b/nextflow.config
@@ -96,15 +96,13 @@ params {
     schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf,amr,amr_db,amr_minid,amr_mincov,database_sets'
 
     wf {
-        stats = true
-        keep_unaligned = true
         example_cmd = [
             "--fastq 'wf-16s-demo/test_data'",
             "--minimap2_by_reference"
         ]
         agent = null
         container_sha = "sha44a6dacff5f2001d917b774647bb4cbc1b53bc76"
-        common_sha = "sha645176f98b8780851f9c476a064d44c2ae76ddf6"
+        common_sha = "sha338caea0a2532dc0ea8f46638ccc322bb8f9af48"
     }
 }
 
@@ -116,7 +114,7 @@ manifest {
     description     = 'Identification of the origin of single reads from 16S/ITS amplicon sequencing.'
     mainScript      = 'main.nf'
     nextflowVersion = '>=23.04.2'
-    version         = 'v1.1.3'
+    version         = 'v1.2.0'
 }