Merge pull request #334 from mapo9/trust4

unselected RNA-seq based workflow
nf-core · Jul 17, 2024 · 91777ac · 91777ac
2 parents df9ca20 + cfa4f73
commit 91777ac
Show file tree

Hide file tree

Showing 18 changed files with 605 additions and 56 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -62,6 +62,8 @@ jobs:
             "test_10x_sc",
             "test_clontech_umi",
             "test_nebnext_umi",
+            "test_rnaseq_bulk",
+            "test_rnaseq_sc",
           ]
       fail-fast: false
     steps:

diff --git a/conf/modules.config b/conf/modules.config
@@ -422,6 +422,14 @@ process {
         ]
     }
 
+    // Publishing rules for the CHANGEO_PARSEDB_SELECT_LOCUS process:
+    // outputs go to <outdir>/vdj_annotation/select-locus/<sample id>, using the
+    // pipeline-wide publish mode. The saveAs closure returns null for
+    // versions.yml so that file is left out of the published results.
+    withName: CHANGEO_PARSEDB_SELECT_LOCUS {
+        publishDir = [
+            path: { "${params.outdir}/vdj_annotation/select-locus/${meta.id}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     withName: CHANGEO_PARSEDB_SPLIT {
         publishDir = [
             path: { "${params.outdir}/vdj_annotation/04-select-productive/${meta.id}" },

diff --git a/conf/test_rnaseq_bulk.config b/conf/test_rnaseq_bulk.config
@@ -0,0 +1,26 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/airrflow -profile test_rnaseq_bulk,<docker/singularity>
+ */
+
+// Parameter overrides applied by the test_rnaseq_bulk profile.
+params {
+    config_profile_name = 'Test bulk RNA-seq based workflow using TRUST4'
+    config_profile_description = 'Minimal test dataset to check pipeline function with raw bulk RNA-seq data'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus = 2
+    max_memory = 6.GB
+    max_time = 48.h
+
+    // Pipeline options: start from raw fastq files and select the
+    // unselected RNA-seq (TRUST4) route via library_generation_method.
+    mode = 'fastq'
+    library_generation_method = 'trust4'
+    // NOTE(review): presumably a fixed threshold of 0 is used to keep the
+    // clonal analysis step fast on the tiny test dataset; confirm.
+    clonal_threshold = 0
+
+    // Input data: samplesheet hosted in the nf-core/test-datasets repository
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-rnaseq/rnaseq_metadata.tsv'
+}
diff --git a/conf/test_rnaseq_sc.config b/conf/test_rnaseq_sc.config
@@ -0,0 +1,30 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/airrflow -profile test_rnaseq_sc,<docker/singularity>
+ */
+
+// Parameter overrides applied by the test_rnaseq_sc profile.
+params {
+    config_profile_name = 'Test single-cell RNA-seq based workflow using TRUST4'
+    config_profile_description = 'Minimal test dataset to check pipeline function with raw single-cell RNA-seq data'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus = 2
+    max_memory = 6.GB
+    max_time = 48.h
+
+    // Pipeline options: start from raw fastq files and select the
+    // unselected RNA-seq (TRUST4) route via library_generation_method.
+    mode = 'fastq'
+    library_generation_method = 'trust4'
+    // NOTE(review): presumably a fixed threshold of 0 is used to keep the
+    // clonal analysis step fast on the tiny test dataset; confirm.
+    clonal_threshold = 0
+    // Cell barcode and UMI are both read from R1; read_format gives their
+    // positions within the read (bc = barcode, um = UMI).
+    // NOTE(review): the start:end coordinates are assumed to follow the
+    // TRUST4 --readFormat convention; confirm against the TRUST4 docs.
+    barcode_read = "R1"
+    umi_read = "R1"
+    read_format = "bc:0:15,um:16:27"
+    // NOTE(review): presumably disables the lineage step to shorten the test run; confirm.
+    skip_lineage = true
+
+    // Input data: samplesheet hosted in the nf-core/test-datasets repository
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/airrflow/testdata-rnaseq/sc_rnaseq_metadata.tsv'
+}
diff --git a/docs/usage.md b/docs/usage.md
@@ -41,13 +41,13 @@ nextflow run nf-core/airrflow \
 A typical command to run the pipeline from **single cell raw fastq files** is:
 
 ```bash
-nextflow run nf-core/airrflow -r dev \
+nextflow run nf-core/airrflow \
 -profile <docker/singularity/podman/shifter/charliecloud/conda/institute> \
 --mode fastq \
 --input input_samplesheet.tsv \
 --library_generation_method sc_10x_genomics \
 --reference_10x reference/refdata-cellranger-vdj-GRCh38-alts-ensembl-5.0.0.tar.gz \
---outdir ./results
+--outdir results
 ```
 
 A typical command for running the pipeline departing from **single-cell AIRR rearrangement tables or assembled bulk sequencing fasta** data is:
@@ -121,7 +121,7 @@ If you wish to share such profile (such as upload as supplementary material for
 
 ## Input samplesheet
 
-### Fastq input samplesheet (bulk sequencing)
+### Fastq input samplesheet (bulk AIRR sequencing)
 
 The required input file for processing raw BCR or TCR bulk targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species`, `tissue`, `pcr_target_locus`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. An example samplesheet is:
 
@@ -143,27 +143,27 @@ The required input file for processing raw BCR or TCR bulk targeted sequencing d
 - `age`: Subject biological age.
 - `single_cell`: TRUE or FALSE.
 
-Other optional columns can be added. These columns will be available when building the contrasts for the repertoire comparison report. It is recommended that these columns also follow the AIRR nomenclature. Examples are:
+Other optional columns can be added. These columns will be available as metadata in the final repertoire table. It is recommended that these columns also follow the AIRR nomenclature. Examples are:
 
 - `intervention`: Description of intervention.
 - `disease_diagnosis`: Diagnosis of subject.
 - `collection_time_point_relative`: Time point at which sample was taken, relative to `collection_time_point_reference` (e.g. 14d, 6 months, baseline).
 - `collection_time_point_reference`: Event in the study schedule to which `Sample collection time` relates to (e.g. primary vaccination, intervention start).
 - `cell_subset`: Commonly-used designation of isolated cell population.
 
-The metadata specified in the input file will then be automatically annotated in a column with the same header in the tables generated by the pipeline.
+It is possible to provide several fastq files per sample (e.g. sequenced over different chips or lanes). In this case the different fastq files per sample will be merged together prior to processing. Provide one fastq pair R1/R2 per row, and the same `sample_id` field for these rows.
 
 ### Fastq input samplesheet (single cell sequencing)
 
-The required input file for processing raw BCR or TCR single cell targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species`, `tissue`, `pcr_target_locus`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. You can refer to the bulk fastq input section for documentation on the individual columns.
+The required input file for processing raw BCR or TCR single cell targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename_R1`, `filename_R2`, `subject_id`, `species`, `tissue`, `pcr_target_locus`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. Any other columns you add will be available in the final repertoire file as extra metadata fields. You can refer to the bulk fastq input section for documentation on the individual columns.
 An example samplesheet is:
 
-| sample_id | filename_R1                     | filename_R2                     | subject_id | species | pcr_target_locus | tissue | sex    | age | biomaterial_provider | single_cell | intervention   | collection_time_point_relative | cell_subset  |
-| --------- | ------------------------------- | ------------------------------- | ---------- | ------- | ---------------- | ------ | ------ | --- | -------------------- | ----------- | -------------- | ------------------------------ | ------------ |
-| sample01  | sample1_S1_L001_R1_001.fastq.gz | sample1_S1_L001_R2_001.fastq.gz | Subject02  | human   | IG               | blood  | NA     | 53  | sequencing_facility  | FALSE       | Drug_treatment | Baseline                       | plasmablasts |
-| sample02  | sample2_S1_L001_R1_001.fastq.gz | sample2_S1_L001_R2_001.fastq.gz | Subject02  | human   | TR               | blood  | female | 78  | sequencing_facility  | FALSE       | Drug_treatment | Baseline                       | plasmablasts |
+| sample_id | filename_R1                      | filename_R2                      | subject_id | species | pcr_target_locus | tissue | sex    | age | biomaterial_provider | single_cell |
+| --------- | -------------------------------- | -------------------------------- | ---------- | ------- | ---------------- | ------ | ------ | --- | -------------------- | ----------- |
+| sample01  | sample01_S1_L001_R1_001.fastq.gz | sample01_S1_L001_R2_001.fastq.gz | Subject02  | human   | IG               | blood  | NA     | 53  | sequencing_facility  | TRUE        |
+| sample02  | sample02_S1_L001_R1_001.fastq.gz | sample02_S1_L001_R2_001.fastq.gz | Subject02  | human   | TR               | blood  | female | 78  | sequencing_facility  | TRUE        |
 
-> FASTQ files must confirm the 10xGenomics cellranger naming conventions<br> >**`[SAMPLE-NAME]`_S1_L00`[LANE-NUMBER]` _`[READ-TYPE]`\_001.fastq.gz**
+> FASTQ files must conform with the 10xGenomics cellranger naming conventions with the same sample name as provided in the `sample_id` column <br> >**`[SAMPLE-NAME]`\_S`[CHIP-NUMBER]`\_L00`[LANE-NUMBER]`\_`[R1/R2]`\_001.fastq.gz**
 >
 > Read type is one of
 >
@@ -172,6 +172,13 @@ An example samplesheet is:
 > - `R1`: Read 1
 > - `R2`: Read 2
 
+It is possible to provide several fastq files per sample (e.g. sequenced over different chips or lanes). In this case the different fastq files per sample will be provided to the same cellranger process. These rows should then have an identical `sample_id` field.
+
+### Fastq input samplesheet (untargeted bulk or sc RNA sequencing)
+
+When running the untargeted protocol, BCR or TCR sequences will be extracted from the untargeted bulk or single-cell RNA sequencing with tools such as [TRUST4](https://github.com/liulab-dfci/TRUST4).
+The required input file is the same as for the [Fastq bulk AIRR samplesheet](#fastq-input-samplesheet-bulk-airr-sequencing) or [Fastq single-cell AIRR samplesheet](#fastq-input-samplesheet-single-cell-sequencing) depending on the input data type (bulk RNAseq or single-cell RNAseq).
+
 ### Assembled input samplesheet (bulk or single-cell sequencing)
 
 The required input file for processing raw BCR or TCR bulk targeted sequencing data is a sample sheet in TSV format (tab separated). The columns `sample_id`, `filename`, `subject_id`, `species`, `tissue`, `single_cell`, `sex`, `age` and `biomaterial_provider` are required. All fields are explained in the previous section, with the only difference being that there is only one `filename` column for the assembled input samplesheet. The provided file will be different from assembled single-cell or bulk data:
@@ -468,6 +475,42 @@ nextflow run nf-core/airrflow -r dev \
 - The 10xGenomics reference can be downloaded from the [download page](https://www.10xgenomics.com/support/software/cell-ranger/downloads)
 - To generate a V(D)J segment fasta file as reference from IMGT one can follow the [cellranger docs](https://support.10xgenomics.com/single-cell-vdj/software/pipelines/latest/advanced/references#imgt).
 
+## Supported unselected RNA-seq based methods
+
+nf-core/airrflow supports unselected bulk or single-cell RNA-seq fastq files as input. [TRUST4](https://github.com/liulab-dfci/TRUST4) is used to extract TCR/BCR sequences from these files. The resulting AIRR tables are then fed into airrflow's Immcantation based workflow. <br>
+To use unselected RNA-seq based input, specify `--library_generation_method trust4`.
+
+### Bulk RNA-seq
+
+A typical command to run the pipeline from **bulk RNA-seq fastq files** is:
+
+```bash
+nextflow run nf-core/airrflow \
+-profile <docker/singularity/podman/shifter/charliecloud/conda/institute> \
+--mode fastq \
+--input input_samplesheet.tsv \
+--library_generation_method trust4 \
+--outdir results
+```
+
+### Single-cell RNA-seq
+
+A typical command to run the pipeline from **single-cell RNA-seq fastq files** is:
+
+```bash
+nextflow run nf-core/airrflow \
+-profile <docker/singularity/podman/shifter/charliecloud/conda/institute> \
+--mode fastq \
+--input input_samplesheet.tsv \
+--library_generation_method trust4 \
+--umi_read R1 \
+--read_format bc:0:15,um:16:27 \
+--outdir results
+```
+
+- If UMIs are present, the read containing them must be specified using the `--umi_read` parameter.
+- The `--read_format` parameter can be used to specify the Barcode and UMI position within the reads (see TRUST4 [docs](https://github.com/liulab-dfci/TRUST4?tab=readme-ov-file#10x-genomics-data-and-barcode-based-single-cell-data))
+
 ## Core Nextflow arguments
 
 :::note

diff --git a/modules.json b/modules.json
@@ -34,6 +34,11 @@
                         "branch": "master",
                         "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",
                         "installed_by": ["modules"]
+                    },
+                    "trust4": {
+                        "branch": "master",
+                        "git_sha": "bbb9636dbe460f45fe786d0866f8fd7337e4fc7a",
+                        "installed_by": ["modules"]
                     }
                 }
             },

diff --git a/...s/local/changeo/changeo_parsedb_select.nf → ...l/changeo/changeo_parsedb_select_locus.nf b/...s/local/changeo/changeo_parsedb_select.nf → ...l/changeo/changeo_parsedb_select_locus.nf
@@ -1,4 +1,4 @@
-process CHANGEO_PARSEDB_SELECT {
+process CHANGEO_PARSEDB_SELECT_LOCUS {
     tag "$meta.id"
     label 'process_low'
     label 'immcantation'
@@ -18,25 +18,21 @@ process CHANGEO_PARSEDB_SELECT {
     path "versions.yml" , emit: versions
 
     script:
-    def args = task.ext.args ?: ''
-    def args2 = task.ext.args2 ?: ''
     if (meta.locus.toUpperCase() == 'IG'){
         """
-        ParseDb.py select -d $tab $args --outname ${meta.id} > ${meta.id}_select_command_log.txt
+        ParseDb.py select -d $tab -f locus -u "IG[HKL]" --regex --outname ${meta.id} > ${meta.id}_select_command_log.txt
 
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
-            igblastn: \$( igblastn -version | grep -o "igblast[0-9\\. ]\\+" | grep -o "[0-9\\. ]\\+" )
             changeo: \$( ParseDb.py --version | awk -F' '  '{print \$2}' )
         END_VERSIONS
         """
     } else if (meta.locus.toUpperCase() == 'TR'){
         """
-        ParseDb.py select -d $tab $args2 --outname ${meta.id} > "${meta.id}_command_log.txt"
+        ParseDb.py select -d $tab -f locus -u "TR[ABDG]" --regex --outname ${meta.id} > "${meta.id}_command_log.txt"
 
         cat <<-END_VERSIONS > versions.yml
         "${task.process}":
-            igblastn: \$( igblastn -version | grep -o "igblast[0-9\\. ]\\+" | grep -o "[0-9\\. ]\\+" )
             changeo: \$( ParseDb.py --version | awk -F' '  '{print \$2}' )
         END_VERSIONS
         """

diff --git a/modules/local/prepare_trust4_reference.nf b/modules/local/prepare_trust4_reference.nf
@@ -0,0 +1,24 @@
+// Builds the combined FASTA reference for the TRUST4 step by concatenating
+// the IMGT FASTA files of the sample's species found in the IgBLAST
+// reference directory.
+//
+// Inputs:
+//   meta, R1, R2      - sample metadata map and its fastq pair; only meta.id
+//                       (task tag) and meta.species are read by this process.
+//   reference_igblast - directory containing fasta/imgt_<species>_*.fasta.
+// Output:
+//   trust4_reference  - tuple of meta and the combined trust4_reference.fa.
+process PREPARE_TRUST4_REFERENCE {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "bioconda::trust4=1.0.13"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/trust4:1.0.13--h43eeafb_0':
+        'biocontainers/trust4:1.0.13--h43eeafb_0' }"
+
+    input:
+    tuple val(meta), path(R1), path(R2)
+    path(reference_igblast)
+
+    output:
+    tuple val(meta), path("trust4_reference.fa") , emit: trust4_reference
+
+    script:
+    // The species glob is listed once so every sequence is written exactly
+    // once (it was previously passed to cat twice, duplicating all entries).
+    // The output is truncated with '>' rather than appended with '>>' so a
+    // re-execution in the same directory cannot accumulate content.
+    def species = meta.species.toLowerCase()
+    """
+    cat ${reference_igblast}/fasta/imgt_${species}_*.fasta > trust4_reference.fa
+    """
+}
diff --git a/modules/nf-core/trust4/environment.yml b/modules/nf-core/trust4/environment.yml
diff --git a/modules/nf-core/trust4/main.nf b/modules/nf-core/trust4/main.nf