Merge pull request nf-core#299 from ggabernet/protocols
Provide profile for NEB and TAKARA protocols
ggabernet authored Feb 16, 2024
2 parents e80c15c + affe6b6 commit 06bbaad
Showing 26 changed files with 951 additions and 191 deletions.
11 changes: 10 additions & 1 deletion .github/workflows/ci.yml
@@ -49,7 +49,16 @@ jobs:
           - "23.04.0"
           - "latest-everything"
         profile:
-          ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled_hs", "test_assembled_mm"]
+          [
+            "test_tcr",
+            "test_no_umi",
+            "test_nocluster",
+            "test_fetchimgt",
+            "test_assembled_hs",
+            "test_assembled_mm",
+            "test_clontech_umi",
+            "test_nebnext_umi",
+          ]
       fail-fast: false
     steps:
       - name: Check out pipeline code
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### `Added`
 
 - [#294](https://github.com/nf-core/airrflow/pull/294) Merge template updates nf-core/tools v2.11.1
+- [#299](https://github.com/nf-core/airrflow/pull/299) Add profile for common NEB and TAKARA protocols
 
 ### `Fixed`
 
125 changes: 57 additions & 68 deletions bin/log_parsing.py
@@ -52,7 +52,7 @@
 df_process_list = []
 
 for process in processes:
-    find = subprocess.check_output(["find", process, "-name", "*command_log.txt"])
+    find = subprocess.check_output(["find", process, "-name", "*command_log*"])
     log_files = find.decode().split("\n")
     log_files = list(filter(None, log_files))
 
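Note on the widened glob: the refactor below introduces per-read-type logs whose names continue after "command_log" (the loop checks for "_R1"/"_R2" in the file name), and those names would not end in "command_log.txt". A minimal sketch of the difference, assuming hypothetical file names of that shape:

    import fnmatch  # shell-style globbing, as used by find -name

    # Hypothetical log names; only the first ends in "command_log.txt"
    names = [
        "sample1_command_log.txt",
        "sample1_command_log_R1.txt",
        "sample1_command_log_R2.txt",
    ]

    print([n for n in names if fnmatch.fnmatch(n, "*command_log.txt")])  # first name only
    print([n for n in names if fnmatch.fnmatch(n, "*command_log*")])     # all three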
@@ -90,50 +90,37 @@
 
     elif process in ["mask_primers", "filter_by_sequence_quality"]:
         s_code = []
+        s_readtype = []
         output_file = []
-        seqs_R1 = []
-        seqs_R2 = []
-        pass_R1 = []
-        pass_R2 = []
-        fail_R1 = []
-        fail_R2 = []
+        n_seqs = []
+        n_pass = []
+        n_fail = []
         process_name = []
 
         for logfile in log_files:
-            c = 0
+            if "_R1" in logfile:
+                s_readtype.append("R1")
+            elif "_R2" in logfile:
+                s_readtype.append("R2")
             with open(logfile, "r") as f:
                 for line in f:
                     if " START>" in line:
-                        if c < 1:
-                            s_code.append(logfile.split("/")[1].split("_command_log")[0])
-
-                            process_name.append(process)
+                        s_code.append(logfile.split("/")[1].split("_command_log")[0])
+                        process_name.append(process)
                     elif "SEQUENCES>" in line:
-                        if c < 1:
-                            seqs_R1.append(line.strip().removeprefix("SEQUENCES> "))
-                        else:
-                            seqs_R2.append(line.strip().removeprefix("SEQUENCES> "))
+                        n_seqs.append(line.strip().removeprefix("SEQUENCES> "))
                     elif "PASS>" in line:
-                        if c < 1:
-                            pass_R1.append(line.strip().removeprefix("PASS> "))
-                        else:
-                            pass_R2.append(line.strip().removeprefix("PASS> "))
+                        n_pass.append(line.strip().removeprefix("PASS> "))
                     elif "FAIL>" in line:
-                        if c < 1:
-                            fail_R1.append(line.strip().removeprefix("FAIL> "))
-                            c += 1
-                        else:
-                            fail_R2.append(line.strip().removeprefix("FAIL> "))
+                        n_fail.append(line.strip().removeprefix("FAIL> "))
 
         df_process = pd.DataFrame.from_dict(
             {
                 "Sample": s_code,
-                "start_R1": seqs_R1,
-                "start_R2": seqs_R2,
-                "pass_R1": pass_R1,
-                "pass_R2": pass_R2,
-                "fail_R1": fail_R1,
-                "fail_R2": fail_R2,
+                "readtype": s_readtype,
+                "start": n_seqs,
+                "pass": n_pass,
+                "fail": n_fail,
                 "process": process_name,
             }
         )
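With the R1/R2 counter gone, each log file now contributes one row to a long-format table keyed by a readtype column, instead of filling parallel *_R1/*_R2 lists. A toy sketch of the resulting frame, with invented counts:

    import pandas as pd

    # Invented counts: one row per (sample, read type) pair
    df_process = pd.DataFrame.from_dict(
        {
            "Sample": ["sample1", "sample1", "sample2", "sample2"],
            "readtype": ["R1", "R2", "R1", "R2"],
            "start": ["100", "100", "80", "80"],
            "pass": ["90", "95", "70", "75"],
            "fail": ["10", "5", "10", "5"],
            "process": ["filter_by_sequence_quality"] * 4,
        }
    )
    print(df_process)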
@@ -344,56 +331,21 @@
 
     df_process_list.append(df_process)
 
-# Getting table colnames
-
-colnames = [
-    "Sample",
-    "Sequences_R1",
-    "Sequences_R2",
-    "Filtered_quality_R1",
-    "Filtered_quality_R2",
-    "Mask_primers_R1",
-    "Mask_primers_R2",
-    "Paired",
-    "Build_consensus",
-    "Assemble_pairs",
-    "Unique",
-    "Representative_2",
-    "Igblast",
-]
-
-
-values = [
-    df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R1"].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R2"].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
-    df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
-    df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
-    df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-    df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-    df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-    df_process_list[6].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
-    df_process_list[7].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
-    df_process_list[7].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
-]
-
-
 # Tables provide extra info and help debugging
 df_process_list[0].to_csv(
     path_or_buf="Table_all_details_filter_quality.tsv",
     sep="\t",
     header=True,
-    index=False,
+    index=True,
 )
 df_process_list[1].to_csv(path_or_buf="Table_all_details_mask_primers.tsv", sep="\t", header=True, index=False)
 df_process_list[2].to_csv(path_or_buf="Table_all_details_paired.tsv", sep="\t", header=True, index=False)
 df_process_list[3].to_csv(
     path_or_buf="Table_all_details_build_consensus.tsv",
     sep="\t",
     header=True,
-    index=False,
+    index=True,
 )
 df_process_list[4].to_csv(path_or_buf="Table_all_details_repaired.tsv", sep="\t", header=True, index=False)
 df_process_list[5].to_csv(
@@ -413,6 +365,43 @@
     index=False,
 )
 
+# Getting table colnames
+
+colnames = [
+    "Sample",
+    "Sequences_R1",
+    "Sequences_R2",
+    "Filtered_quality_R1",
+    "Filtered_quality_R2",
+    "Mask_primers_R1",
+    "Mask_primers_R2",
+    "Paired",
+    "Build_consensus",
+    "Assemble_pairs",
+    "Unique",
+    "Representative_2",
+    "Igblast",
+]
+
+print(df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype"))
+
+values = [
+    df_process_list[2].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
+    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["start"]["R1"].tolist(),
+    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["start"]["R2"].tolist(),
+    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(),
+    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R2"].tolist(),
+    df_process_list[1].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(),
+    df_process_list[1].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R2"].tolist(),
+    df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+    df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+    df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+    df_process_list[6].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
+    df_process_list[7].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
+    df_process_list[7].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
+]
+
+
 final_table = dict(zip(colnames, values))
 print(final_table)
 df_final_table = pd.DataFrame.from_dict(final_table)
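Because the per-read tables are now long-format, the summary assembly moves below the detail exports and pivots each frame back to one wide row per sample. A sketch of the pivot on invented data:

    import pandas as pd

    # Invented long-format counts, mirroring the frames built above
    df = pd.DataFrame(
        {
            "Sample": ["sample1", "sample1", "sample2", "sample2"],
            "readtype": ["R1", "R2", "R1", "R2"],
            "start": [100, 100, 80, 80],
            "pass": [90, 95, 70, 75],
            "fail": [10, 5, 10, 5],
        }
    )

    # pivot() lifts readtype into the column labels: one row per Sample,
    # MultiIndex columns like ("start", "R1")
    wide = df.sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")
    print(wide["start"]["R1"].tolist())  # [100, 80]: R1 starting reads per sample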
40 changes: 40 additions & 0 deletions conf/clontech_umi_bcr.config
@@ -0,0 +1,40 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for the Takara Bio / Clontech SMARTer v2 (UMI) BCR protocol
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines the primer, UMI and assembly settings required to run the pipeline on data
+    generated with this protocol.
+    Use as follows:
+        nextflow run nf-core/airrflow -profile clontech_umi_bcr,<docker/singularity> --outdir <OUTDIR>
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Takara Bio / Clontech SMARTer v2'
+    config_profile_description = 'Profile to run pipeline for the Takara Bio / Clontech SMARTer v2 (UMI) BCR protocol profile'
+
+    mode = 'fastq'
+
+    library_generation_method = 'dt_5p_race_umi'
+
+    cprimers = 'https://bitbucket.org/kleinstein/immcantation/raw/c98269b194e9c6262fe3b098be3600ba7f64b85c/protocols/Universal/Human_IG_CRegion_RC.fasta'
+
+    // primer options
+    cprimer_position = 'R1'
+    cprimer_start = 0
+    vprimer_start = 0
+    umi_length = 12
+    umi_position = 'R2'
+    cluster_sets = false
+
+    // Mask primer options
+    maskprimers_align = true
+    primer_extract_len = 7
+    primer_mask_mode = 'cut'
+    primer_maxlen = 70
+    primer_r1_maxerror = 0.2
+    assemblepairs_sequential = true
+    primer_consensus = 0.6
+}
44 changes: 44 additions & 0 deletions conf/clontech_umi_tcr.config
@@ -0,0 +1,44 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for the Takara Bio / Clontech SMARTer v2 (UMI) TCR protocol
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines the primer, UMI and assembly settings required to run the pipeline on data
+    generated with this protocol.
+    Use as follows:
+        nextflow run nf-core/airrflow -profile clontech_umi_tcr,<docker/singularity> --outdir <OUTDIR>
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Takara Bio / Clontech SMARTer v2 TCR'
+    config_profile_description = 'Profile to run pipeline for the Takara Bio / Clontech SMARTer v2 (UMI) TCR protocol profile'
+
+    mode = 'fastq'
+
+    library_generation_method = 'dt_5p_race_umi'
+
+    cprimers = 'https://bitbucket.org/kleinstein/immcantation/raw/16f94088c1df5c7a0ee1c9ea8b403cd4d2488e8a/protocols/Universal/Human_TR_CRegion_RC.fasta'
+
+    // primer options
+    cprimer_position = 'R1'
+    cprimer_start = 0
+    vprimer_start = 0
+    umi_length = 12
+    umi_position = 'R2'
+    cluster_sets = false
+
+    // Mask primer options
+    maskprimers_align = true
+    primer_extract_len = 7
+    primer_mask_mode = 'cut'
+    primer_maxlen = 70
+    primer_r1_maxerror = 0.2
+    assemblepairs_sequential = true
+    primer_consensus = 0.6
+
+    // TCR options
+    clonal_threshold = 0
+    skip_lineage = true
+}