Merge pull request nf-core#299 from ggabernet/protocols
Provide profile for NEB and TAKARA protocols
ggabernet authored Feb 16, 2024
2 parents e80c15c + affe6b6 commit 06bbaad
Showing 26 changed files with 951 additions and 191 deletions.
11 changes: 10 additions & 1 deletion .github/workflows/ci.yml
@@ -49,7 +49,16 @@ jobs:
           - "23.04.0"
           - "latest-everything"
         profile:
-          ["test_tcr", "test_no_umi", "test_nocluster", "test_fetchimgt", "test_assembled_hs", "test_assembled_mm"]
+          [
+            "test_tcr",
+            "test_no_umi",
+            "test_nocluster",
+            "test_fetchimgt",
+            "test_assembled_hs",
+            "test_assembled_mm",
+            "test_clontech_umi",
+            "test_nebnext_umi",
+          ]
       fail-fast: false
     steps:
       - name: Check out pipeline code
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### `Added`
 
 - [#294](https://github.com/nf-core/airrflow/pull/294) Merge template updates nf-core/tools v2.11.1
+- [#299](https://github.com/nf-core/airrflow/pull/299) Add profile for common NEB and TAKARA protocols
 
 ### `Fixed`
 
125 changes: 57 additions & 68 deletions bin/log_parsing.py
@@ -52,7 +52,7 @@
 df_process_list = []
 
 for process in processes:
-    find = subprocess.check_output(["find", process, "-name", "*command_log.txt"])
+    find = subprocess.check_output(["find", process, "-name", "*command_log*"])
     log_files = find.decode().split("\n")
     log_files = list(filter(None, log_files))
 
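Note on the widened glob: the refactor below introduces per-read-type logs whose names continue after "command_log" (the loop checks for "_R1"/"_R2" in the file name), and those names would not end in "command_log.txt". A minimal sketch of the difference, assuming hypothetical file names of that shape:

    import fnmatch  # shell-style globbing, as used by find -name

    # Hypothetical log names; only the first ends in "command_log.txt"
    names = [
        "sample1_command_log.txt",
        "sample1_command_log_R1.txt",
        "sample1_command_log_R2.txt",
    ]

    print([n for n in names if fnmatch.fnmatch(n, "*command_log.txt")])  # first name only
    print([n for n in names if fnmatch.fnmatch(n, "*command_log*")])     # all three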
@@ -90,50 +90,37 @@
 
     elif process in ["mask_primers", "filter_by_sequence_quality"]:
         s_code = []
+        s_readtype = []
         output_file = []
-        seqs_R1 = []
-        seqs_R2 = []
-        pass_R1 = []
-        pass_R2 = []
-        fail_R1 = []
-        fail_R2 = []
+        n_seqs = []
+        n_pass = []
+        n_fail = []
         process_name = []
 
         for logfile in log_files:
-            c = 0
+            if "_R1" in logfile:
+                s_readtype.append("R1")
+            elif "_R2" in logfile:
+                s_readtype.append("R2")
             with open(logfile, "r") as f:
                 for line in f:
                     if " START>" in line:
-                        if c < 1:
-                            s_code.append(logfile.split("/")[1].split("_command_log")[0])
-
-                            process_name.append(process)
+                        s_code.append(logfile.split("/")[1].split("_command_log")[0])
+                        process_name.append(process)
                     elif "SEQUENCES>" in line:
-                        if c < 1:
-                            seqs_R1.append(line.strip().removeprefix("SEQUENCES> "))
-                        else:
-                            seqs_R2.append(line.strip().removeprefix("SEQUENCES> "))
+                        n_seqs.append(line.strip().removeprefix("SEQUENCES> "))
                     elif "PASS>" in line:
-                        if c < 1:
-                            pass_R1.append(line.strip().removeprefix("PASS> "))
-                        else:
-                            pass_R2.append(line.strip().removeprefix("PASS> "))
+                        n_pass.append(line.strip().removeprefix("PASS> "))
                     elif "FAIL>" in line:
-                        if c < 1:
-                            fail_R1.append(line.strip().removeprefix("FAIL> "))
-                            c += 1
-                        else:
-                            fail_R2.append(line.strip().removeprefix("FAIL> "))
+                        n_fail.append(line.strip().removeprefix("FAIL> "))
 
         df_process = pd.DataFrame.from_dict(
             {
                 "Sample": s_code,
-                "start_R1": seqs_R1,
-                "start_R2": seqs_R2,
-                "pass_R1": pass_R1,
-                "pass_R2": pass_R2,
-                "fail_R1": fail_R1,
-                "fail_R2": fail_R2,
+                "readtype": s_readtype,
+                "start": n_seqs,
+                "pass": n_pass,
+                "fail": n_fail,
                 "process": process_name,
             }
         )
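With the R1/R2 counter gone, each log file now contributes one row to a long-format table keyed by a readtype column, instead of filling parallel *_R1/*_R2 lists. A toy sketch of the resulting frame, with invented counts:

    import pandas as pd

    # Invented counts: one row per (sample, read type) pair
    df_process = pd.DataFrame.from_dict(
        {
            "Sample": ["sample1", "sample1", "sample2", "sample2"],
            "readtype": ["R1", "R2", "R1", "R2"],
            "start": ["100", "100", "80", "80"],
            "pass": ["90", "95", "70", "75"],
            "fail": ["10", "5", "10", "5"],
            "process": ["filter_by_sequence_quality"] * 4,
        }
    )
    print(df_process)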
@@ -344,56 +331,21 @@
 
     df_process_list.append(df_process)
 
-# Getting table colnames
-
-colnames = [
-    "Sample",
-    "Sequences_R1",
-    "Sequences_R2",
-    "Filtered_quality_R1",
-    "Filtered_quality_R2",
-    "Mask_primers_R1",
-    "Mask_primers_R2",
-    "Paired",
-    "Build_consensus",
-    "Assemble_pairs",
-    "Unique",
-    "Representative_2",
-    "Igblast",
-]
-
-
-values = [
-    df_process_list[0].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R1"].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).loc[:, "start_R2"].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
-    df_process_list[0].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
-    df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R1"].tolist(),
-    df_process_list[1].sort_values(by=["Sample"]).loc[:, "pass_R2"].tolist(),
-    df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-    df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-    df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
-    df_process_list[6].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
-    df_process_list[7].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
-    df_process_list[7].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
-]
-
-
 # Tables provide extra info and help debugging
 df_process_list[0].to_csv(
     path_or_buf="Table_all_details_filter_quality.tsv",
     sep="\t",
     header=True,
-    index=False,
+    index=True,
 )
 df_process_list[1].to_csv(path_or_buf="Table_all_details_mask_primers.tsv", sep="\t", header=True, index=False)
 df_process_list[2].to_csv(path_or_buf="Table_all_details_paired.tsv", sep="\t", header=True, index=False)
 df_process_list[3].to_csv(
     path_or_buf="Table_all_details_build_consensus.tsv",
     sep="\t",
     header=True,
-    index=False,
+    index=True,
 )
 df_process_list[4].to_csv(path_or_buf="Table_all_details_repaired.tsv", sep="\t", header=True, index=False)
 df_process_list[5].to_csv(
@@ -413,6 +365,43 @@
     index=False,
 )
 
+# Getting table colnames
+
+colnames = [
+    "Sample",
+    "Sequences_R1",
+    "Sequences_R2",
+    "Filtered_quality_R1",
+    "Filtered_quality_R2",
+    "Mask_primers_R1",
+    "Mask_primers_R2",
+    "Paired",
+    "Build_consensus",
+    "Assemble_pairs",
+    "Unique",
+    "Representative_2",
+    "Igblast",
+]
+
+print(df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype"))
+
+values = [
+    df_process_list[2].sort_values(by=["Sample"]).iloc[:, 0].tolist(),
+    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["start"]["R1"].tolist(),
+    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["start"]["R2"].tolist(),
+    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(),
+    df_process_list[0].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R2"].tolist(),
+    df_process_list[1].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R1"].tolist(),
+    df_process_list[1].sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")["pass"]["R2"].tolist(),
+    df_process_list[2].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+    df_process_list[4].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+    df_process_list[5].sort_values(by=["Sample"]).loc[:, "pass_pairs"].tolist(),
+    df_process_list[6].sort_values(by=["Sample"]).loc[:, "unique"].tolist(),
+    df_process_list[7].sort_values(by=["Sample"]).loc[:, "repres_2"].tolist(),
+    df_process_list[7].sort_values(by=["Sample"]).loc[:, "pass_igblast"].tolist(),
+]
+
+
 final_table = dict(zip(colnames, values))
 print(final_table)
 df_final_table = pd.DataFrame.from_dict(final_table)
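Because the per-read tables are now long-format, the summary assembly moves below the detail exports and pivots each frame back to one wide row per sample. A sketch of the pivot on invented data:

    import pandas as pd

    # Invented long-format counts, mirroring the frames built above
    df = pd.DataFrame(
        {
            "Sample": ["sample1", "sample1", "sample2", "sample2"],
            "readtype": ["R1", "R2", "R1", "R2"],
            "start": [100, 100, 80, 80],
            "pass": [90, 95, 70, 75],
            "fail": [10, 5, 10, 5],
        }
    )

    # pivot() lifts readtype into the column labels: one row per Sample,
    # MultiIndex columns like ("start", "R1")
    wide = df.sort_values(by=["Sample"]).pivot(index="Sample", columns="readtype")
    print(wide["start"]["R1"].tolist())  # [100, 80]: R1 starting reads per sample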
40 changes: 40 additions & 0 deletions conf/clontech_umi_bcr.config
@@ -0,0 +1,40 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for the Takara Bio / Clontech SMARTer v2 (UMI) BCR protocol
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines the primer, UMI and assembly settings required to run the pipeline on data
+    generated with this protocol.
+    Use as follows:
+        nextflow run nf-core/airrflow -profile clontech_umi_bcr,<docker/singularity> --outdir <OUTDIR>
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Takara Bio / Clontech SMARTer v2'
+    config_profile_description = 'Profile to run pipeline for the Takara Bio / Clontech SMARTer v2 (UMI) BCR protocol profile'
+
+    mode = 'fastq'
+
+    library_generation_method = 'dt_5p_race_umi'
+
+    cprimers = 'https://bitbucket.org/kleinstein/immcantation/raw/c98269b194e9c6262fe3b098be3600ba7f64b85c/protocols/Universal/Human_IG_CRegion_RC.fasta'
+
+    // primer options
+    cprimer_position = 'R1'
+    cprimer_start = 0
+    vprimer_start = 0
+    umi_length = 12
+    umi_position = 'R2'
+    cluster_sets = false
+
+    // Mask primer options
+    maskprimers_align = true
+    primer_extract_len = 7
+    primer_mask_mode = 'cut'
+    primer_maxlen = 70
+    primer_r1_maxerror = 0.2
+    assemblepairs_sequential = true
+    primer_consensus = 0.6
+}
44 changes: 44 additions & 0 deletions conf/clontech_umi_tcr.config
@@ -0,0 +1,44 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for the Takara Bio / Clontech SMARTer v2 (UMI) TCR protocol
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines the primer, UMI and assembly settings required to run the pipeline on data
+    generated with this protocol.
+    Use as follows:
+        nextflow run nf-core/airrflow -profile clontech_umi_tcr,<docker/singularity> --outdir <OUTDIR>
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Takara Bio / Clontech SMARTer v2 TCR'
+    config_profile_description = 'Profile to run pipeline for the Takara Bio / Clontech SMARTer v2 (UMI) TCR protocol profile'
+
+    mode = 'fastq'
+
+    library_generation_method = 'dt_5p_race_umi'
+
+    cprimers = 'https://bitbucket.org/kleinstein/immcantation/raw/16f94088c1df5c7a0ee1c9ea8b403cd4d2488e8a/protocols/Universal/Human_TR_CRegion_RC.fasta'
+
+    // primer options
+    cprimer_position = 'R1'
+    cprimer_start = 0
+    vprimer_start = 0
+    umi_length = 12
+    umi_position = 'R2'
+    cluster_sets = false
+
+    // Mask primer options
+    maskprimers_align = true
+    primer_extract_len = 7
+    primer_mask_mode = 'cut'
+    primer_maxlen = 70
+    primer_r1_maxerror = 0.2
+    assemblepairs_sequential = true
+    primer_consensus = 0.6
+
+    // TCR options
+    clonal_threshold = 0
+    skip_lineage = true
+}