Commit

revert changes that do not belong to template update
famosab committed Jan 9, 2025
1 parent 0263c7e commit 2efe5e4
Showing 42 changed files with 2,993 additions and 34 deletions.
43 changes: 43 additions & 0 deletions bin/vcf2counts.R
@@ -0,0 +1,43 @@
#!/usr/bin/env Rscript
'VCF to count matrix converter
Usage:
vcf2counts.R --help
vcf2counts.R --output=<ofile> VCF
Options:
-h, --help help screen
-o, --output=<ofile> output file name [default: mat.csv]
Arguments:
VCF input vcf file
'->doc

suppressMessages(library(VariantAnnotation, warn.conflicts = FALSE, quietly=TRUE))
suppressMessages(library(docopt, warn.conflicts = FALSE, quietly=TRUE))
suppressMessages(library(Matrix, warn.conflicts = FALSE, quietly=TRUE))

generateMatrixfromVCF <- function(VCF, ofile) {
# Read in VCF file
vcfobj <- readVcf(VCF)
# Convert genotype to SNP matrix
genomat <- geno(vcfobj)$GT

variantmat <- apply(genomat, c(1, 2), function(x) {
xstrip <- gsub("[[:punct:]]", "", x)
if (xstrip == "11") {
return (2)
} else if (xstrip %in% c("01", "10")) {
return (1)
} else if (xstrip %in% c("00", "")) {
return (0)
} else {
return (NA)
}
})
write.csv(variantmat, file = ofile)
}

opt <- docopt(doc)

generateMatrixfromVCF(opt$VCF, opt[["--output"]])
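
For reference, the snippet below is a minimal, self-contained sketch of the encoding this script applies (homozygous reference or missing genotype → 0, heterozygous → 1, homozygous alternate → 2, anything else → NA). The toy GT matrix is illustrative and not taken from the pipeline's test data; it uses only base R, so it runs without VariantAnnotation:

```r
# Toy demonstration of the GT -> count encoding used by vcf2counts.R.
# The genotype strings are made up for illustration; a real run reads them from a VCF.
genomat <- matrix(
  c("0/0", "0/1", "1/1",
    "0|1", "./.", "1/0"),
  nrow = 3, ncol = 2,
  dimnames = list(c("var1", "var2", "var3"), c("sampleA", "sampleB"))
)

counts <- apply(genomat, c(1, 2), function(x) {
  xstrip <- gsub("[[:punct:]]", "", x) # "0/1" -> "01", "./." -> ""
  if (xstrip == "11") {
    2  # homozygous alternate
  } else if (xstrip %in% c("01", "10")) {
    1  # heterozygous
  } else if (xstrip %in% c("00", "")) {
    0  # homozygous reference or missing
  } else {
    NA # anything else, e.g. multi-allelic genotypes
  }
})

print(counts)
#      sampleA sampleB
# var1       0       1
# var2       1       0
# var3       2       1
```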
10 changes: 2 additions & 8 deletions conf/test.config
@@ -20,13 +20,7 @@ process {

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'
config_profile_description = 'Minimal test dataset to check pipeline function. Skips processes GATK_GENOTYPEGVCFS and BCFTOOLS_MERGE.'

// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'

// Genome references
genome = 'R64-1-1'
input = "${projectDir}/tests/input.csv"
}
8 changes: 4 additions & 4 deletions conf/test_full.config
@@ -15,10 +15,10 @@ params {
config_profile_description = 'Full test dataset to check pipeline function'

// Input data for full size test
// TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA)
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv'
input = "${projectDir}/tests/input-full.csv"

// Genome references
genome = 'R64-1-1'
fasta = "https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/homo_sapiens/genome/genome.fasta"
fai = "https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/homo_sapiens/genome/genome.fasta.fai"
dict = "https://github.com/nf-core/test-datasets/raw/refs/heads/modules/data/genomics/homo_sapiens/genome/genome.dict"
}
4 changes: 3 additions & 1 deletion docs/output.md
@@ -57,7 +57,9 @@ A custom R script is used to convert the finalized VCF to a CSV which can be use

[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.

Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.### Pipeline information
Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.

### Pipeline information

<details markdown="1">
<summary>Output files</summary>
12 changes: 6 additions & 6 deletions docs/usage.md
@@ -4,19 +4,19 @@
## Introduction

<!-- TODO nf-core: Add documentation about anything specific to running your pipeline. For general topics, please point to (and add to) the main nf-core website. -->
This document describes the usage of the pipeline. The pipeline processes (g)VCF files coming from different variant calling pipelines and converts them to a matrix format which is suitable for downstream clustering analysis. The pipeline is built using [Nextflow](https://www.nextflow.io/).
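
As an illustrative sketch only, the resulting matrix can be clustered with standard R tooling; the file name `mat.csv` below is the converter script's default output name, and the distance and linkage choices are arbitrary rather than prescribed by the pipeline:

```r
# Hierarchical clustering of samples from the 0/1/2 count matrix written by vcf2counts.R.
# "mat.csv" is the script's default output name; metric and linkage are illustrative choices.
mat <- as.matrix(read.csv("mat.csv", row.names = 1))  # rows = variants, columns = samples

d  <- dist(t(mat), method = "manhattan")  # pairwise distances between samples
hc <- hclust(d, method = "average")       # agglomerative clustering
plot(hc, main = "Sample clustering from VCF-derived count matrix")
```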

## Samplesheet input

You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row as shown in the examples below.

```bash
--input '[path to samplesheet file]'
--input '[path to samplesheet file]' --genome '[genome version]'
```

### Multiple runs of the same sample
### Same sample, different callers

The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
The `sample` identifiers have to be the same when the VCFs originate from the same BAM file but were produced by different variant callers. The pipeline will merge all VCFs belonging to the same sample into one VCF file; if a sample has only one VCF file, merging is skipped.

```csv title="samplesheet.csv"
sample,label,gvcf,vcf_path,vcf_index_path
@@ -113,7 +113,7 @@ Several generic profiles are bundled with the pipeline which instruct the pipeli
> [!IMPORTANT]
> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.

The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to check if your system is supported, please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation).
The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to check if your system is suported, please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation).

Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important!
They are loaded in sequence, so later profiles can overwrite earlier profiles.
14 changes: 12 additions & 2 deletions main.nf
@@ -24,10 +24,11 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_vcft
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

// TODO nf-core: Remove this line if you don't need a FASTA file
// This is an example of how to use getGenomeAttribute() to fetch parameters
// from igenomes.config using `--genome`
params.fasta = getGenomeAttribute('fasta')
params.fai = getGenomeAttribute('fasta_fai')
params.dict = getGenomeAttribute('dict')

/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -45,11 +46,20 @@ workflow QBICPIPELINES_VCFTOMAT {

main:

// FASTA
fasta = params.fasta ? Channel.fromPath(params.fasta).collect() : Channel.value([])
fai = params.fai ? Channel.fromPath(params.fai).collect() : Channel.value([])
dict = params.dict ? Channel.fromPath(params.dict).collect() : Channel.value([])


//
// WORKFLOW: Run pipeline
//
VCFTOMAT (
samplesheet
samplesheet,
fasta,
fai,
dict
)
emit:
multiqc_report = VCFTOMAT.out.multiqc_report // channel: /path/to/multiqc_report.html
5 changes: 5 additions & 0 deletions modules.json
@@ -29,6 +29,11 @@
"branch": "master",
"git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d",
"installed_by": ["modules"]
},
"tabix/tabix": {
"branch": "master",
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
}
}
},
7 changes: 7 additions & 0 deletions modules/local/vcf2mat/environment.yml
@@ -0,0 +1,7 @@
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::bioconductor-variantannotation
- conda-forge::r-docopt
- conda-forge::r-matrix
45 changes: 45 additions & 0 deletions modules/local/vcf2mat/main.nf
@@ -0,0 +1,45 @@
process VCF2MAT {
tag "$meta.id"
label 'process_single'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/6c/6c2dd8fc4240adf343ad71f9c56158d87f28b2577f2a6e114b7ab8406f0c4672/data' :
'community.wave.seqera.io/library/bioconductor-variantannotation_r-docopt_r-matrix:3cf2f20fdc477746' }"

input:
tuple val(meta), path(vcf)

output:
tuple val(meta), path("*.csv") , emit: csv
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when

script:
def prefix = task.ext.prefix ?: "${meta.id}"
def VERSION = '1.0.0'
"""
vcf2counts.R \\
--output ${prefix}.csv \\
$vcf
cat <<-END_VERSIONS > versions.yml
"${task.process}":
vcf2counts.R: $VERSION
END_VERSIONS
"""

stub:
def prefix = task.ext.prefix ?: "${meta.id}"
def VERSION = '1.0.0'
"""
touch ${prefix}.csv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
vcf2counts.R: $VERSION
END_VERSIONS
"""
}
5 changes: 5 additions & 0 deletions modules/nf-core/bcftools/merge/environment.yml

74 changes: 74 additions & 0 deletions modules/nf-core/bcftools/merge/main.nf

96 changes: 96 additions & 0 deletions modules/nf-core/bcftools/merge/meta.yml
