From 48e0e7a55275175244da041204e5e30f7e58da97 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Wed, 27 Sep 2023 12:30:12 +0100 Subject: [PATCH 01/16] untested changes - still awaiting inputs --- automate_demultiplex_config.py | 147 +++++++++++++++++++--- demultiplex.py | 61 ++++++--- upload_and_setoff_workflows.py | 218 ++++++++++++++++++++++++++++++--- 3 files changed, 378 insertions(+), 48 deletions(-) diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py index 8a213bf0..2a1e9036 100644 --- a/automate_demultiplex_config.py +++ b/automate_demultiplex_config.py @@ -189,6 +189,7 @@ upload_multiqc_path = "Apps/upload_multiqc_v1.4.0" # RPKM path RPKM_path = "Apps/RPKM_using_conifer_v1.6" + # FastQC app fastqc_app = "Apps/fastqc_v1.4.0" # bedfile folder @@ -268,6 +269,41 @@ # Mokapipe FH_PRS BED file FH_PRS_bedfile_name = "Pan4909.bed" +### exome depth +# exome depth readcount app +ED_readcount_path = "Apps/ED_readcount_analysis_v1.2.0" +ED_readcount_path_instance_type = "mem1_ssd1_v2_x8" +#exome depth variant calling app +ED_cnvcalling_path = "Apps/ED_cnv_calling_v1.2.0" +ED_cnvcalling_instance_type = "mem1_ssd1_v2_x4" +#VCP1 exome depth +ED_readcount_normals_VCP1_file="XXX" +ED_VCP1_readcount_BEDfile_pannum = "Pan4398" # just put pan number - full bedfile name is made in nexus_bedfiles function +#VCP2 normals data file +ED_readcount_normals_VCP2_file="XXX" +ED_VCP2_readcount_BEDfile_pannum = "Pan4973" # just put pan number - full bedfile name is made in nexus_bedfiles function +#VCP3 normals data file +ED_readcount_normals_VCP3_file="XXX" +ED_VCP3_readcount_BEDfile_pannum = "Pan5149" # just put pan number - full bedfile name is made in nexus_bedfiles function + +exomedepth_refgenome_file = "project-ByfFPz00jy1fk6PjpZ95F27J:file-B6ZY7VG2J35Vfvpkj8y0KZ01" #hs37d5.fa.gz from 001 +## readcount app inputs +exomedepth_readcount_reference_genome_input="-ireference_genome=%s" % (exomedepth_refgenome_file) +exomedepth_readcount_bedfile_input="-ibedfile=" +exomedepth_readcount_normalsRdata_input="-inormals_RData=" +exomedepth_readcount_projectname_input="-iproject_name=" +exomedepth_readcount_pannumbers_input="-ibamfile_pannumbers=" +exomedepth_readcount_rdata_output="rdataout" + + +## ED CNV calling inputs +exomedepth_cnvcalling_reference_genome_input="-ireference_genome=%s" % (exomedepth_refgenome_file) +exomedepth_cnvcalling_readcount_file_input="-ireadcount_file=" +exomedepth_cnvcalling_subpanel_bed_input="-isubpanel_bed=" +exomedepth_cnvcalling_projectname_input="-iproject_name=" +exomedepth_cnvcalling_pannumbers_input="-ibamfile_pannumbers=" + + # MokaWES workflow_inputs wes_fastqc1 = " -istage-Ff0P5Jj0GYKY717pKX3vX8Z3.reads=" # FastQC Read 1 wes_fastqc2 = " -istage-Ff0P5V00GYKyJfpX5bqX69Yg.reads=" # FastQC Read 2 @@ -448,10 +484,10 @@ "Pan4146", # VCP3 Viapath R81 (CM) "Pan4149", # VCP2 Viapath R208 (BRCA) "Pan4150", # VCP2 Viapath R207 (ovarian) - "Pan4127", # VCP2 Viapath R209 (colorectal) "Pan4129", # VCP2 Viapath R210 (lynch) "Pan4964", # VCP2 Viapath R259 (nijmegen) "Pan4130", # VCP2 Viapath R211 (polyposis) + "Pan5186", # VCP2 Viapath R414 APC "Pan5121", # VCP2 Viapath R430 (prostate) "Pan5143", # VCP2 Viapath R444.1 Breast cancer (PARP treatment) "Pan5147", # VCP2 Viapath R444.2 Prostate cancer (PARP treatment) @@ -474,9 +510,9 @@ "Pan4825", # VCP1 STG R337 CADASIL "Pan4816", # VCP2 STG R208 BRCA "Pan4817", # VCP2 STG R207 ovarian - "Pan4818", # VCP2 STG R209 colorectal "Pan4819", # VCP2 STG R210 lynch "Pan4820", # VCP2 STG R211 polyposis + "Pan5185", # VCP2 STG R414 APC "Pan5122", # VCP2 STG R430 prostate "Pan5144", # VCP2 STG R444.1 Breast cancer (PARP treatment) "Pan5148", # VCP2 STG R444.2 Prostate cancer (PARP treatment) @@ -501,6 +537,7 @@ "Pan5014", # LRPCR STG R211 PMS2 "Pan5015", # LRPCR Via R71 SMN1 "Pan5016", # LRPCR Via R239 IKBKG + "Pan5180", # development run - stops warning messages ] @@ -533,14 +570,12 @@ vcp2_panel_list = [ "Pan4149", "Pan4150", - "Pan4127", "Pan4129", "Pan4130", "Pan4042", "Pan4049", "Pan4816", "Pan4817", - "Pan4818", "Pan4819", "Pan4820", "Pan4964", @@ -549,7 +584,9 @@ "Pan5143", "Pan5144", "Pan5147", - "Pan5148" + "Pan5148", + "Pan5185", + "Pan5186", ] vcp3_panel_list = [ "Pan4132", @@ -596,6 +633,7 @@ "Pan5015", "Pan5016", ] +development_pannumber_list=["Pan5180"] tso500_panel_list = [ "Pan4969", "Pan5085", @@ -646,6 +684,9 @@ "TSO500_high_throughput": False, "drylab_dnanexus_id": None, "masked_reference": False, + "exome_depth_cnvcalling_BED": False, + "development_run":False, # used to stopunknown pan number errors but will only demultiplex + } # override default panel settings @@ -703,6 +744,7 @@ "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, + "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4042": { # VCP2 STG BRCA "mokapipe": True, @@ -715,6 +757,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", + "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan5144": { # VCP2 R444.1 Breast cancer (PARP treatment- STG) "mokapipe": True, @@ -727,6 +770,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan5183" }, "Pan5148": { # VCP2 R444.2 Prostate cancer (PARP treatment- STG) "mokapipe": True, @@ -739,6 +783,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan5184" }, "Pan4009": { # MokaSNP "mokasnp": True, @@ -756,6 +801,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", + "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4043": { # VCP3 STG "mokapipe": True, @@ -768,6 +814,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4119": { # VCP1 R134_Familial hypercholesterolaemia-Familial hypercholesterolaemia Small panel (Viapath) "mokapipe": True, @@ -779,6 +826,7 @@ "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", "FH": True, + "exome_depth_cnvcalling_BED": "Pan4702" }, "Pan4121": { # VCP1 R184 CF (Viapath) "mokapipe": True, @@ -789,6 +837,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4703" }, "Pan4122": { # VCP1 R25 FGFR Viapath "mokapipe": True, @@ -799,6 +848,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4125": { # VCP1 R73 DMD (Viapath) "mokapipe": True, @@ -809,6 +859,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan5135" }, "Pan4126": { # VCP1 R337_CADASIL Viapath "mokapipe": True, @@ -819,6 +870,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4549" }, "Pan4974": { # VCP1 Viapath (Molecular Haemostasis) R112 "mokapipe": True, @@ -829,6 +881,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4985" }, "Pan4975": { # VCP1 Viapath (Molecular Haemostasis) R115 "mokapipe": True, @@ -839,6 +892,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4986" }, "Pan4976": { # VCP1 Viapath (Molecular Haemostasis) R116 "mokapipe": True, @@ -849,6 +903,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4987" }, "Pan4977": { # VCP1 Viapath (Molecular Haemostasis) R117 "mokapipe": True, @@ -859,6 +914,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4988" }, "Pan4978": { # VCP1 Viapath (Molecular Haemostasis) R118 "mokapipe": True, @@ -869,6 +925,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4989" }, "Pan4979": { # VCP1 Viapath (Molecular Haemostasis) R119 "mokapipe": True, @@ -879,6 +936,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4990" }, "Pan4980": { # VCP1 Viapath (Molecular Haemostasis) R120 "mokapipe": True, @@ -889,6 +947,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4991" }, "Pan4981": { # VCP1 Viapath (Molecular Haemostasis) R121 "mokapipe": True, @@ -899,6 +958,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4708" }, "Pan4982": { # VCP1 Viapath (Molecular Haemostasis) R122 "mokapipe": True, @@ -909,6 +969,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4992" }, "Pan4983": { # VCP1 Viapath (Molecular Haemostasis) R123 "mokapipe": True, @@ -919,6 +980,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4993" }, "Pan4984": { # VCP1 Viapath (Molecular Haemostasis) R124 "mokapipe": True, @@ -929,8 +991,9 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4994" }, - "Pan4149": { # VCP2 BRCA (Viapath) + "Pan4149": { # VCP2 BRCA (Viapath) R208 "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", @@ -939,6 +1002,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", + "exome_depth_cnvcalling_BED": "Pan5158" }, "Pan4964": { # VCP2 R259 nijmegen breakage (Viapath) "mokapipe": True, @@ -949,6 +1013,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", + "exome_depth_cnvcalling_BED": "Pan5161" }, "Pan4150": { # VCP2 R207 ovarian cancer (Viapath) "mokapipe": True, @@ -960,29 +1025,33 @@ "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5152" }, - "Pan4127": { # VCP2 R209 colorectal cancer (Viapath) + "Pan4129": { # VCP2 R210 Lynch syndrome (Viapath) "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", - "congenica_project": "5093", + "congenica_project": "5094", "RPKM_also_analyse": vcp2_panel_list, "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", + "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5159" }, - "Pan4129": { # VCP2 R210 Lynch syndrome (Viapath) + "Pan4130": { # VCP2 R211 polyposis (Viapath) "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", - "congenica_project": "5094", + "congenica_project": "5095", "RPKM_also_analyse": vcp2_panel_list, "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5160" }, - "Pan4130": { # VCP2 R211 polyposis (Viapath) + "Pan5186": { # VCP2 R414 APC (Viapath) "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", @@ -991,7 +1060,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", - "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5162" }, "Pan5121": { # VCP2 R430 prostate (Viapath) "mokapipe": True, @@ -1003,6 +1072,7 @@ "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5165", }, "Pan5143": { # VCP2 R444.1 Breast cancer (PARP treatment- Viapath) "mokapipe": True, @@ -1013,6 +1083,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", + "exome_depth_cnvcalling_BED": "Pan5183" }, "Pan5147": { # VCP2 R444.2 Prostate cancer (PARP treatment- Viapath) "mokapipe": True, @@ -1023,6 +1094,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", + "exome_depth_cnvcalling_BED": "Pan5184", }, "Pan4132": { # VCP3 R56 (Viapath) "mokapipe": True, @@ -1033,6 +1105,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan4561", }, "Pan4134": { # VCP3 R57 (Viapath) "mokapipe": True, @@ -1043,6 +1116,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan4565" }, "Pan4136": { # VCP3 R58 (Viapath) "mokapipe": True, @@ -1053,6 +1127,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan4566" }, "Pan4137": { # VCP3 R60 (Viapath) "mokapipe": True, @@ -1063,6 +1138,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan4567" }, "Pan4138": { # VCP3 R62 (Viapath) "mokapipe": True, @@ -1073,6 +1149,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan4552" }, "Pan4143": { # VCP3 R66 (Viapath) "mokapipe": True, @@ -1083,6 +1160,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan5174" }, "Pan4144": { # VCP3 R78 (Viapath) "mokapipe": True, @@ -1093,6 +1171,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan4531", }, "Pan4145": { # VCP3 R79 - CMD (Viapath) "mokapipe": True, @@ -1103,6 +1182,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan5168" }, "Pan4146": { # VCP3 R81 CM (Viapath) "mokapipe": True, @@ -1113,6 +1193,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan5170" }, "Pan4151": { # VCP3 R82 limb girdle (Viapath) "mokapipe": True, @@ -1123,6 +1204,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4351": { # VCP3 R227 (Viapath) "mokapipe": True, @@ -1133,6 +1215,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan5177" }, "Pan4387": { # VCP3 R90 Bleeding and platelet disorders (Viapath) "mokapipe": True, @@ -1143,6 +1226,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan5171" }, "Pan4390": { # VCP3 R97 Thrombophilia with a likely monogenic cause (Viapath) "mokapipe": True, @@ -1153,6 +1237,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan5173", }, "Pan4314": { # VCP3 R229 (Viapath) "mokapipe": True, @@ -1163,6 +1248,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + "exome_depth_cnvcalling_BED": "Pan5179", }, "Pan4396": { # ArcherDx (Synnovis) "archerdx": True, @@ -1226,6 +1312,7 @@ "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, "FH": True, + "exome_depth_cnvcalling_BED": "Pan4702" }, "Pan4822": { # VCP1 STG R184_CF "mokapipe": True, @@ -1239,6 +1326,7 @@ "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, + "exome_depth_cnvcalling_BED": "Pan4703" }, "Pan4823": { # VCP1 STG R25_FGFR "mokapipe": True, @@ -1252,6 +1340,7 @@ "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, + "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4824": { # VCP1 STG R73_DMD "mokapipe": True, @@ -1265,6 +1354,7 @@ "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, + "exome_depth_cnvcalling_BED": "Pan5135" }, "Pan4825": { # VCP1 STG R337_cadasil "mokapipe": True, @@ -1278,6 +1368,7 @@ "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, + "exome_depth_cnvcalling_BED": "Pan4549" }, "Pan4826": { # VCP3 STG R56 "mokapipe": True, @@ -1290,6 +1381,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan4561" }, "Pan4827": { # VCP3 STG R57 "mokapipe": True, @@ -1302,6 +1394,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan4865" }, "Pan4828": { # VCP3 STG R58 "mokapipe": True, @@ -1314,6 +1407,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan4566" }, "Pan4829": { # VCP3 STG R60 "mokapipe": True, @@ -1326,6 +1420,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan4567" }, "Pan4830": { # VCP3 STG R62 "mokapipe": True, @@ -1338,6 +1433,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan4552" }, "Pan4831": { # VCP3 STG R66 "mokapipe": True, @@ -1350,6 +1446,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan5174" }, "Pan4832": { # VCP3 STG R78 "mokapipe": True, @@ -1362,6 +1459,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan4531", }, "Pan4833": { # VCP3 STG R79 "mokapipe": True, @@ -1374,6 +1472,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan5168" }, "Pan4834": { # VCP3 STG R81 "mokapipe": True, @@ -1386,6 +1485,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan5170", }, "Pan4835": { # VCP3 STG R82 "mokapipe": True, @@ -1398,6 +1498,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4836": { # VCP3 STG R229 "mokapipe": True, @@ -1410,8 +1511,9 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan5179" }, - "Pan4818": { # VCP2 STG R209 + "Pan4819": { # VCP2 STG R210 "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", @@ -1422,8 +1524,10 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", + "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5159" }, - "Pan4819": { # VCP2 STG R210 + "Pan4820": { # VCP2 STG R211 "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", @@ -1435,8 +1539,9 @@ "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5160" }, - "Pan4820": { # VCP2 STG R211 + "Pan5185": { # VCP2 STG R414 "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", @@ -1447,7 +1552,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", - "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5162" }, "Pan4816": { # VCP2 STG R208 "mokapipe": True, @@ -1460,6 +1565,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan5158" }, "Pan4817": { # VCP2 STG R207 "mokapipe": True, @@ -1473,6 +1579,7 @@ "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5152" }, "Pan5122": { # VCP2 STG R430 prostate "mokapipe": True, @@ -1486,6 +1593,7 @@ "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5165" }, "Pan5007": { # LRPCR Via R207 PMS2 "mokapipe": True, @@ -1497,6 +1605,7 @@ "variant_calling_bedfile": "Pan4767data.bed", "sambamba_bedfile": "Pan5018dataSambamba.bed", "masked_reference": "project-ByfFPz00jy1fk6PjpZ95F27J:file-GF84GF00QfBfzV35Gf8Qg53q", # hs37d5_Pan4967.bwa-index.tar.gz + }, "Pan5008": { # LRPCR STG R207 PMS2 "mokapipe": True, @@ -1509,6 +1618,7 @@ "variant_calling_bedfile": "Pan4767data.bed", "sambamba_bedfile": "Pan5018dataSambamba.bed", "masked_reference": "project-ByfFPz00jy1fk6PjpZ95F27J:file-GF84GF00QfBfzV35Gf8Qg53q", # hs37d5_Pan4967.bwa-index.tar.gz + }, "Pan5011": { # LRPCR Via R210 PMS2 "mokapipe": True, @@ -1601,6 +1711,9 @@ "sambamba_bedfile": "Pan5018dataSambamba.bed", "masked_reference": "project-ByfFPz00jy1fk6PjpZ95F27J:file-GF84GF00QfBfzV35Gf8Qg53q", # hs37d5_Pan4967.bwa-index.tar.gz }, + "Pan5180": { # DEVELOPMENT run - used to allow demultiplexing, but stop samplesheet checks/incorrect pan number alerts + "development_run": True, + }, } @@ -1676,7 +1789,7 @@ "tso_pannumbers": "-itso_pannumbers=Pan4969,Pan5085,Pan5114", "stg_pannumbers": ( "-istg_pannumbers=Pan4042,Pan4043,Pan4044,Pan4049,Pan4821,Pan4822," - "Pan4823,Pan4824,Pan4825,Pan4816,Pan4817,Pan4818,Pan4819,Pan4820," + "Pan4823,Pan4824,Pan4825,Pan4816,Pan4817,Pan4819,Pan4820," "Pan4826,Pan4827,Pan4828,Pan4829,Pan4830,Pan4831,Pan4832,Pan4833," "Pan4834,Pan4835,Pan4836,Pan5008,Pan5010,Pan5012,Pan5014,Pan5122," "Pan5144,Pan5148" diff --git a/demultiplex.py b/demultiplex.py index 67b4d001..78575b11 100644 --- a/demultiplex.py +++ b/demultiplex.py @@ -212,23 +212,52 @@ def already_demultiplexed(self, runfolder): self.script_logfile.write("Checking if already demultiplexed .........Run has not yet been demultiplexed\n") self.samplesheet = self.runfolder + "_SampleSheet.csv" self.samplesheet_path = os.path.join(config.samplesheets_dir, self.samplesheet) - # run samplesheet checks (uses try to ensure that should an error occur this doesn't affect the other - # script functionality - ss_verification_results = samplesheet_verifier.run_ss_checks(self.samplesheet_path) - ss_fail = "" - ss_pass = "" - # If the value is True (i.e. check has passed), append to pass list, else append to fail list - for key in ss_verification_results: - if ss_verification_results[key][0]: - ss_pass += ss_verification_results[key][1] - else: - ss_fail += ss_verification_results[key][1] - if ss_pass: - self.logger("Following samplesheet checked were passed by {}: {}".format(self.samplesheet, ss_pass), - "demultiplex_success") - if ss_fail: - self.logger("SAMPLESHEET CHECKS FAILED {}: {}".format(self.samplesheet, ss_fail), "samplesheet_warning") + # if development run skip the samplesheet check to avoid endless alerts + if not self.check_for_development_run(): + # run samplesheet checks (uses try to ensure that should an error occur this doesn't affect the other + # script functionality + ss_verification_results = samplesheet_verifier.run_ss_checks(self.samplesheet_path) + ss_fail = "" + ss_pass = "" + # If the value is True (i.e. check has passed), append to pass list, else append to fail list + for key in ss_verification_results: + if ss_verification_results[key][0]: + ss_pass += ss_verification_results[key][1] + else: + ss_fail += ss_verification_results[key][1] + if ss_pass: + self.logger("Following samplesheet checked were passed by {}: {}".format(self.samplesheet, ss_pass), + "demultiplex_success") + if ss_fail: + self.logger("SAMPLESHEET CHECKS FAILED {}: {}".format(self.samplesheet, ss_fail), "samplesheet_warning") + else: + self.script_logfile.write("development pan number identified.skipping samplesheet checks\n") self.has_run_finished() + + def check_for_development_run(self,samplesheet_path): + """ + Read samplesheet looking for development pan number. + If pannumber where development_run is True is present add samplename to list + return sample_list (will return False if empty) + """ + sample_list = [] + + with open(self.runfolder_obj.runfolder_samplesheet_path, "r") as samplesheet_stream: + # read the file into a list and loop through the list in reverse (bottom to top). + # this allows us to access the sample names, and stop when reach the column headers, skipping the header of the file. + for line in reversed(samplesheet_stream.readlines()): + if line.startswith("Sample_ID") or "[Data]" in line: + break + # skip empty lines (check first element of the line, after splitting on comma) + elif len(line.split(",")[0]) < 2: + pass + # if it's a line detailing a sample + else: + for pannum in config.development_pannumber_list: + if pannum in line: + sample_list.append(line.split(",")[0]) + + return sample_list def has_run_finished(self): """Check if sequencing has completed for the current runfolder. This is denoted by the diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index d171b676..8a952b68 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -200,6 +200,14 @@ def __init__(self, runfolder, now, debug_mode=False): "jobid=$(dx run %s%s --priority high -y --instance-type mem1_ssd1_v2_x8" % (config.app_project, config.RPKM_path) ) + self.ED_readcount_command = ( + "EDjobid=$(dx run %s%s --priority high -y --instance-type %s" + % (config.app_project, config.ED_readcount_path, config.ED_readcount_path_instance_type) + ) + self.ED_cnvcalling_command = ( + "jobid=$(dx run %s%s --priority high -y --instance-type %s" + % (config.app_project, config.ED_cnvcalling_path, config.ED_cnvcalling_instance_type) + ) self.mokaamp_command = ( "jobid=$(dx run %s%s --priority high -y --name " % (config.app_project, config.mokaamp_path) @@ -316,21 +324,25 @@ def quarterback(self): self.runfolder_obj.runfolderpath, self.runfolder_obj.runfolder_name, ) - # check for TSO500 run - this is not demultiplexed locally but the entire runfolder is uploaded - # read samplesheet to create a list of samples - TSO500_sample_list = self.check_for_TSO500() - # if not TSO500 will return None - if TSO500_sample_list: - self.list_of_processed_samples, self.fastq_string = ( - TSO500_sample_list, - self.runfolder_obj.runfolder_samplesheet_path, - ) - + # check for development pan number. If found self.list_of_processed_sampels will be empty and no further processing will occur + if self.check_for_development_run(): + self.loggers.script.info("development pan number identified in samplesheet. Stopping any further processing") else: - ( - self.list_of_processed_samples, - self.fastq_string, - ) = self.find_fastqs(self.runfolder_obj.fastq_folder_path) + # check for TSO500 run - this is not demultiplexed locally but the entire runfolder is uploaded + # read samplesheet to create a list of samples + TSO500_sample_list = self.check_for_TSO500() + # if not TSO500 will return None + if TSO500_sample_list: + self.list_of_processed_samples, self.fastq_string = ( + TSO500_sample_list, + self.runfolder_obj.runfolder_samplesheet_path, + ) + + else: + ( + self.list_of_processed_samples, + self.fastq_string, + ) = self.find_fastqs(self.runfolder_obj.fastq_folder_path) if self.list_of_processed_samples: # build the project name using the WES batch and NGS run numbers @@ -611,6 +623,40 @@ def check_for_TSO500(self): open(self.loggers.upload_agent.filepath, "w").close() return sample_list + def check_for_development_run(self): + """ + Read samplesheet looking for development pan number. + If pannumber where development_run is True is present add samplename to list + return sample_list (will return False if empty) + """ + sample_list = [] + # build list of development pan numbers + development_panel_list=[] + for pan in self.panel_dictionary: + if pan["development_run"]: + development_panel_list.append(pan) + + with open( + self.runfolder_obj.runfolder_samplesheet_path, "r" + ) as samplesheet_stream: + # read the file into a list and loop through the list in reverse (bottom to top). + # this allows us to access the sample names, and stop when reach the column headers, skipping the header of the file. + for line in reversed(samplesheet_stream.readlines()): + if line.startswith("Sample_ID") or "[Data]" in line: + break + # skip empty lines (check first element of the line, after splitting on comma) + elif len(line.split(",")[0]) < 2: + pass + # if it's a line detailing a sample + else: + for pannum in development_panel_list: + if pannum in line: + sample_list.append(line.split(",")[0]) + # as it takes a long time before the upload create the file to stop further processing + if sample_list: + open(self.loggers.upload_agent.filepath, "w").close() + return sample_list + def calculate_cluster_density(self, runfolder_path, runfolder_name): """ Inputs = runfolder name and runfolder path @@ -1350,6 +1396,20 @@ def nexus_bedfiles(self, pannumber): + self.panel_dictionary[pannumber]["RPKM_bedfile_pan_number"] + "_RPKM.bed" ) + if self.panel_dictionary[pannumber]["exome_depth_readcount_BED"]: + bed_dict["ED_readcount_bedfile"] = ( + config.app_project + + config.bedfile_folder + + self.panel_dictionary[pannumber]["exome_depth_readcount_BED"] + + "exomedepth.bed" + ) + if self.panel_dictionary[pannumber]["exome_depth_cnvcalling_BED"]: + bed_dict["exome_depth_cnvcalling_BED"] = ( + config.app_project + + config.bedfile_folder + + self.panel_dictionary[pannumber]["exome_depth_cnvcalling_BED"] + + "_CNV.bed" + ) return bed_dict def start_building_dx_run_cmds(self): @@ -1377,6 +1437,8 @@ def start_building_dx_run_cmds(self): congenica_upload = False joint_variant_calling = False # not currently in use rpkm_list = [] # list for panels needing RPKM analysis + pannnumber_list= [] + exome_depth = False TSO500 = False # loop through samples @@ -1393,9 +1455,15 @@ def start_building_dx_run_cmds(self): elif re.search(r"_R1_", fastq): # extract Pan number and use this to determine which dx run commands are needed for the sample panel = re.search(r"Pan\d+", fastq).group() + #create a list of all pan numbers in the run + pannnumber_list.append(panel) # The order in which the modules are called here is important to ensure the order # of dx run commands is correct. This affects which decision support tool data is sent to. - + + # determine if exome depth is needed - the exact commands will be determined in the function which handles exome_depth commands + if self.panel_dictionary[panel]["exome_depth_cnvcalling_BED"]: + exome_depth = True + # If panel is to be processed using MokaWES if self.panel_dictionary[panel]["mokawes"]: # call function to build the MokaWES command and add to command list and depends list @@ -1485,6 +1553,11 @@ def start_building_dx_run_cmds(self): # to stop custom panels being analysed by peddy - may cause problems commands_list.append(self.run_peddy_command()) commands_list.append(self.add_to_depends_list("peddy", 'depends_list')) + + if exome_depth: + # exome depth is run once per capture, and then for each capture, one per pannumber. This function returns a list of commands so need to add these to commands list + for cmd in self.determine_exome_depth_requirements(pannnumber_list): + commands_list.append(cmd) if TSO500: # build command for the TSO500 app and set off fastqc commands @@ -1540,6 +1613,121 @@ def start_building_dx_run_cmds(self): return commands_list + def determine_exome_depth_requirements(self,pannnumber_list): + """ + This function takes a list of all pan numbers found on this run. + Exome depth is run in 2 stages, firstly readcounts are calculated for each capture panel (VCP1 or VCP2 etc). + The jobid will be saved to $EDjobid which allows the output of this stage to be used to filter CNVs with a panel specific BEDfile. + The CNV calling steps should be a dependancy of multiqc + This function controls the order these commands are built and run so the output of the readcount step can be used as an input to the cnvcalling step + Inputs: + List of Pannumbers on the run + Returns: + List of dx run commands + """ + + VCP1=[] + VCP2=[] + VCP3=[] + command_list=[] + for pannumber in set(pannnumber_list): + if pannumber in config.vcp1_panel_list: + VCP1.append(pannumber) + if pannumber in config.vcp2_panel_list: + VCP2.append(pannumber) + if pannumber in config.vcp3_panel_list: + VCP3.append(pannumber) + + # make sure there are enough samples for that capture + if len(VCP1)>2: + # first build readcount command. + command_list.append(self.build_ED_readcount_cmd(set(VCP1), config.ED_readcount_normals_VCP1_file,config.ED_VCP1_readcount_BEDfile_pannum)) + # The output of readcount can be used in multiqc so add this to the multiqc depends list + command_list.append(self.add_to_depends_list("exomedepth", 'depends_list')) + # the cnvcalling stage can use the jobid from the readcount stage as an input so run these before the next capture panel + for panel in set(VCP1):# then build cnvcalling commands + command_list.append(self.build_ED_cnv_calling_cmd(panel)) + + if len(VCP2)>2: + # first build readcount command + command_list.append(self.build_ED_readcount_cmd(set(VCP2), config.ED_readcount_normals_VCP2_file,config.ED_VCP2_readcount_BEDfile_pannum)) + command_list.append(self.add_to_depends_list("exomedepth", 'depends_list')) + for panel in set(VCP2):# then build cnvcalling commands + command_list.append(self.build_ED_cnv_calling_cmd(panel)) + + if len(VCP3)>2: + # first build readcount command + command_list.append(self.build_ED_readcount_cmd(set(VCP3), config.ED_readcount_normals_VCP3_file,config.ED_VCP3_readcount_BEDfile_pannum)) + command_list.append(self.add_to_depends_list("exomedepth", 'depends_list')) + for panel in set(VCP2):# then build cnvcalling commands + command_list.append(self.build_ED_cnv_calling_cmd(panel)) + + return command_list + def build_ED_readcount_cmd(self,pannumber_list, normals_file,readcount_bedfile_pannum): + """ + This function builds the dx run command for the exome depth readcount app + This is run once per capture panel + Inputs: + pannumber_list = list of Pan numbers for this capture panel on this run. used to determine which BAM files to download + normals_file = predefined panel of normals data file (from config) + readcount bedfile pannumber = predefined capture panel wide BEDfile (from config) + Returns: + dx run cmd (string) + """ + #build bedfile address from the readcount_bedfile_pannum input + bedfiles = self.nexus_bedfiles(readcount_bedfile_pannum) + readcount_bedfile=bedfiles["exome_depth_readcount_BED"] + + dx_command_list = [ + self.ED_readcount_command, + config.exomedepth_readcount_reference_genome_input, + config.exomedepth_readcount_bedfile_input, + readcount_bedfile, + config.exomedepth_cnvcalling_subpanel_bed_input, + config.exomedepth_readcount_normalsRdata_input, + normals_file, + config.exomedepth_readcount_projectname_input, + self.runfolder_obj.nexus_project_name, + config.exomedepth_readcount_pannumbers_input, + ",".join(pannumber_list), + self.depends_gatk, # use list of gatk related jobs to delay start + self.dest, + self.dest_cmd, + self.token, + ] + dx_command = "".join(map(str, dx_command_list)) + return dx_command + + def build_ED_cnv_calling_cmd(self,pannumber): + """ + This function builds the dx run command to filter the CNV calls for a specific R number using a BEDfile + Input: + pannumber = pannumber to filter CNV calls + Returns: + dx run cmd (string) + """ + # build bedfile address using the given pan number extract exome_depth_cnvcalling_BED from panel config dict + bedfiles = self.nexus_bedfiles(pannumber) + ed_cnvcalling_bedfile = bedfiles["exome_depth_cnvcalling_BED"] + + dx_command_list = [ + self.ED_cnvcalling_command, + config.exomedepth_cnvcalling_reference_genome_input, + config.exomedepth_cnvcalling_readcount_file_input, + "$EDjobid:%s" % (config.exomedepth_readcount_rdata_output), + config.exomedepth_cnvcalling_subpanel_bed_input, + ed_cnvcalling_bedfile, + config.exomedepth_cnvcalling_projectname_input, + self.runfolder_obj.nexus_project_name, + config.exomedepth_cnvcalling_pannumbers_input, + pannumber, + self.dest, + self.dest_cmd, + self.token, + ] + dx_command = "".join(map(str, dx_command_list)) + return dx_command + def create_mokawes_command(self, fastq, pannumber): """ Input = R1 fastq filename and Pan number for a single sample From 434d04108a6791ba7a638ede1b7c93ab81ef2488 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Wed, 27 Sep 2023 12:36:49 +0100 Subject: [PATCH 02/16] add expected RData file names --- automate_demultiplex_config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py index 2a1e9036..c474848f 100644 --- a/automate_demultiplex_config.py +++ b/automate_demultiplex_config.py @@ -277,13 +277,13 @@ ED_cnvcalling_path = "Apps/ED_cnv_calling_v1.2.0" ED_cnvcalling_instance_type = "mem1_ssd1_v2_x4" #VCP1 exome depth -ED_readcount_normals_VCP1_file="XXX" -ED_VCP1_readcount_BEDfile_pannum = "Pan4398" # just put pan number - full bedfile name is made in nexus_bedfiles function +ED_readcount_normals_VCP1_file="Pan5134_normals_v1.0.0.RData" +ED_VCP1_readcount_BEDfile_pannum = "Pan5134" # just put pan number - full bedfile name is made in nexus_bedfiles function #VCP2 normals data file -ED_readcount_normals_VCP2_file="XXX" -ED_VCP2_readcount_BEDfile_pannum = "Pan4973" # just put pan number - full bedfile name is made in nexus_bedfiles function +ED_readcount_normals_VCP2_file="Pan5132_normals_v1.0.0.RData" +ED_VCP2_readcount_BEDfile_pannum = "Pan5132" # just put pan number - full bedfile name is made in nexus_bedfiles function #VCP3 normals data file -ED_readcount_normals_VCP3_file="XXX" +ED_readcount_normals_VCP3_file="Pan5149_normals_v1.0.0.RData" ED_VCP3_readcount_BEDfile_pannum = "Pan5149" # just put pan number - full bedfile name is made in nexus_bedfiles function exomedepth_refgenome_file = "project-ByfFPz00jy1fk6PjpZ95F27J:file-B6ZY7VG2J35Vfvpkj8y0KZ01" #hs37d5.fa.gz from 001 From e57e58f1e066ac2bab5f55153b65fb063ee96200 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Wed, 27 Sep 2023 13:02:47 +0100 Subject: [PATCH 03/16] tidy up pan numbers --- automate_demultiplex_config.py | 130 ++++++++++++++------------------- 1 file changed, 56 insertions(+), 74 deletions(-) diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py index c474848f..2fb3ec67 100644 --- a/automate_demultiplex_config.py +++ b/automate_demultiplex_config.py @@ -732,33 +732,33 @@ "hsmetrics_bedfile": "Pan4082.bed", "sambamba_bedfile": "Pan4082Sambamba.bed", }, - "Pan4044": { # VCP1 STG - "mokapipe": True, - "multiqc_coverage_level": 30, - "RPKM_bedfile_pan_number": "Pan4399", - "RPKM_also_analyse": vcp1_panel_list, - "congenica_credentials": "STG", - "congenica_IR_template": "non-priority", - "congenica_project": "4203", - "hsmetrics_bedfile": "Pan4397data.bed", - "variant_calling_bedfile": "Pan4398data.bed", - "sambamba_bedfile": "Pan4397dataSambamba.bed", - "STG": True, - "exome_depth_cnvcalling_BED": "PanXXXX" - }, - "Pan4042": { # VCP2 STG BRCA - "mokapipe": True, - "multiqc_coverage_level": 30, - "RPKM_bedfile_pan_number": "Pan5109", - "RPKM_also_analyse": vcp2_panel_list, - "congenica_credentials": "STG", - "congenica_IR_template": "non-priority", - "congenica_project": "1099", - "hsmetrics_bedfile": "Pan5123data.bed", - "variant_calling_bedfile": "Pan5119data.bed", - "sambamba_bedfile": "Pan5123dataSambamba.bed", - "exome_depth_cnvcalling_BED": "PanXXXX" - }, + # "Pan4044": { # VCP1 STG + # "mokapipe": True, + # "multiqc_coverage_level": 30, + # "RPKM_bedfile_pan_number": "Pan4399", + # "RPKM_also_analyse": vcp1_panel_list, + # "congenica_credentials": "STG", + # "congenica_IR_template": "non-priority", + # "congenica_project": "4203", + # "hsmetrics_bedfile": "Pan4397data.bed", + # "variant_calling_bedfile": "Pan4398data.bed", + # "sambamba_bedfile": "Pan4397dataSambamba.bed", + # "STG": True, + # "exome_depth_cnvcalling_BED": "PanXXXX" + # }, + # "Pan4042": { # VCP2 STG BRCA + # "mokapipe": True, + # "multiqc_coverage_level": 30, + # "RPKM_bedfile_pan_number": "Pan5109", + # "RPKM_also_analyse": vcp2_panel_list, + # "congenica_credentials": "STG", + # "congenica_IR_template": "non-priority", + # "congenica_project": "1099", + # "hsmetrics_bedfile": "Pan5123data.bed", + # "variant_calling_bedfile": "Pan5119data.bed", + # "sambamba_bedfile": "Pan5123dataSambamba.bed", + # "exome_depth_cnvcalling_BED": "PanXXXX" + # }, "Pan5144": { # VCP2 R444.1 Breast cancer (PARP treatment- STG) "mokapipe": True, "multiqc_coverage_level": 30, @@ -790,32 +790,32 @@ "multiqc_coverage_level": 30, "variant_calling_bedfile": "Pan4009.bed", }, - "Pan4049": { # VCP2 STG CrCa - "mokapipe": True, - "multiqc_coverage_level": 30, - "RPKM_bedfile_pan_number": "Pan5109", - "RPKM_also_analyse": vcp2_panel_list, - "congenica_credentials": "STG", - "congenica_IR_template": "non-priority", - "congenica_project": "4202", - "hsmetrics_bedfile": "Pan5123data.bed", - "variant_calling_bedfile": "Pan5119data.bed", - "sambamba_bedfile": "Pan5123dataSambamba.bed", - "exome_depth_cnvcalling_BED": "PanXXXX" - }, - "Pan4043": { # VCP3 STG - "mokapipe": True, - "multiqc_coverage_level": 30, - "RPKM_bedfile_pan_number": "Pan4362", - "RPKM_also_analyse": vcp3_panel_list, - "congenica_credentials": "STG", - "congenica_IR_template": "non-priority", - "congenica_project": "4201", - "hsmetrics_bedfile": "Pan4995data.bed", - "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "PanXXXX" - }, + # "Pan4049": { # VCP2 STG CrCa + # "mokapipe": True, + # "multiqc_coverage_level": 30, + # "RPKM_bedfile_pan_number": "Pan5109", + # "RPKM_also_analyse": vcp2_panel_list, + # "congenica_credentials": "STG", + # "congenica_IR_template": "non-priority", + # "congenica_project": "4202", + # "hsmetrics_bedfile": "Pan5123data.bed", + # "variant_calling_bedfile": "Pan5119data.bed", + # "sambamba_bedfile": "Pan5123dataSambamba.bed", + # "exome_depth_cnvcalling_BED": "PanXXXX" + # }, + # "Pan4043": { # VCP3 STG + # "mokapipe": True, + # "multiqc_coverage_level": 30, + # "RPKM_bedfile_pan_number": "Pan4362", + # "RPKM_also_analyse": vcp3_panel_list, + # "congenica_credentials": "STG", + # "congenica_IR_template": "non-priority", + # "congenica_project": "4201", + # "hsmetrics_bedfile": "Pan4995data.bed", + # "variant_calling_bedfile": "Pan4995data.bed", + # "sambamba_bedfile": "Pan4995dataSambamba.bed", + # "exome_depth_cnvcalling_BED": "PanXXXX" + # }, "Pan4119": { # VCP1 R134_Familial hypercholesterolaemia-Familial hypercholesterolaemia Small panel (Viapath) "mokapipe": True, "multiqc_coverage_level": 30, @@ -848,7 +848,6 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", - "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4125": { # VCP1 R73 DMD (Viapath) "mokapipe": True, @@ -1105,7 +1104,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan4561", }, "Pan4134": { # VCP3 R57 (Viapath) "mokapipe": True, @@ -1116,7 +1114,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan4565" }, "Pan4136": { # VCP3 R58 (Viapath) "mokapipe": True, @@ -1127,7 +1124,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan4566" }, "Pan4137": { # VCP3 R60 (Viapath) "mokapipe": True, @@ -1138,7 +1134,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan4567" }, "Pan4138": { # VCP3 R62 (Viapath) "mokapipe": True, @@ -1149,7 +1144,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan4552" }, "Pan4143": { # VCP3 R66 (Viapath) "mokapipe": True, @@ -1171,7 +1165,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan4531", }, "Pan4145": { # VCP3 R79 - CMD (Viapath) "mokapipe": True, @@ -1204,7 +1197,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4351": { # VCP3 R227 (Viapath) "mokapipe": True, @@ -1340,7 +1332,6 @@ "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, - "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4824": { # VCP1 STG R73_DMD "mokapipe": True, @@ -1381,7 +1372,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan4561" }, "Pan4827": { # VCP3 STG R57 "mokapipe": True, @@ -1394,7 +1384,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan4865" }, "Pan4828": { # VCP3 STG R58 "mokapipe": True, @@ -1407,7 +1396,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan4566" }, "Pan4829": { # VCP3 STG R60 "mokapipe": True, @@ -1420,7 +1408,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan4567" }, "Pan4830": { # VCP3 STG R62 "mokapipe": True, @@ -1433,7 +1420,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan4552" }, "Pan4831": { # VCP3 STG R66 "mokapipe": True, @@ -1459,7 +1445,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan4531", }, "Pan4833": { # VCP3 STG R79 "mokapipe": True, @@ -1498,7 +1483,6 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "PanXXXX" }, "Pan4836": { # VCP3 STG R229 "mokapipe": True, @@ -1788,11 +1772,9 @@ # tso_pannumbers should not include the dry lab pan number "tso_pannumbers": "-itso_pannumbers=Pan4969,Pan5085,Pan5114", "stg_pannumbers": ( - "-istg_pannumbers=Pan4042,Pan4043,Pan4044,Pan4049,Pan4821,Pan4822," - "Pan4823,Pan4824,Pan4825,Pan4816,Pan4817,Pan4819,Pan4820," - "Pan4826,Pan4827,Pan4828,Pan4829,Pan4830,Pan4831,Pan4832,Pan4833," - "Pan4834,Pan4835,Pan4836,Pan5008,Pan5010,Pan5012,Pan5014,Pan5122," - "Pan5144,Pan5148" + "-istg_pannumbers=Pan4821,Pan4822,Pan4823,Pan4824,Pan4825,Pan4816,Pan4817,Pan4819,Pan4820," + "Pan4826,Pan4827,Pan4828,Pan4829,Pan4830,Pan4831,Pan4832,Pan4833,Pan4834,Pan4835,Pan4836," + "Pan5008,Pan5010,Pan5012,Pan5014,Pan5122,Pan5144,Pan5148" ), "cp_capture_pannos": "-icp_capture_pannos=Pan5109,Pan4399,Pan4362", } From 1cf628b271cf2786e3889c0630a508df045fce95 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Wed, 27 Sep 2023 13:13:09 +0100 Subject: [PATCH 04/16] exclude VCP pan numbers that don't need CNV calling --- upload_and_setoff_workflows.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index 8a952b68..8ba38a62 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -1630,13 +1630,16 @@ def determine_exome_depth_requirements(self,pannnumber_list): VCP2=[] VCP3=[] command_list=[] + # could prob do list comprehension here for pannumber in set(pannnumber_list): - if pannumber in config.vcp1_panel_list: - VCP1.append(pannumber) - if pannumber in config.vcp2_panel_list: - VCP2.append(pannumber) - if pannumber in config.vcp3_panel_list: - VCP3.append(pannumber) + # not all VCP1/2/3 pan numbers need CNV calling + if self.panel_dictionary[pannumber]["exome_depth_cnvcalling_BED"]: + if pannumber in config.vcp1_panel_list: + VCP1.append(pannumber) + if pannumber in config.vcp2_panel_list: + VCP2.append(pannumber) + if pannumber in config.vcp3_panel_list: + VCP3.append(pannumber) # make sure there are enough samples for that capture if len(VCP1)>2: From e43a852ed3e25619f6a8c5f6526541fc9441a9fc Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Wed, 4 Oct 2023 10:36:49 +0100 Subject: [PATCH 05/16] tested ed automation for vcp1 and 2.. also fix #504 --- automate_demultiplex_config.py | 110 ++++++++++++++++----------------- demultiplex.py | 4 +- upload_and_setoff_workflows.py | 33 +++++----- 3 files changed, 71 insertions(+), 76 deletions(-) diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py index 2fb3ec67..e579fb4b 100644 --- a/automate_demultiplex_config.py +++ b/automate_demultiplex_config.py @@ -8,7 +8,7 @@ import os # Set debug mode -testing = False +testing = True # =====location of input/output files===== # root of folder that contains the apps, automate_demultiplexing_logfiles and @@ -277,31 +277,31 @@ ED_cnvcalling_path = "Apps/ED_cnv_calling_v1.2.0" ED_cnvcalling_instance_type = "mem1_ssd1_v2_x4" #VCP1 exome depth -ED_readcount_normals_VCP1_file="Pan5134_normals_v1.0.0.RData" -ED_VCP1_readcount_BEDfile_pannum = "Pan5134" # just put pan number - full bedfile name is made in nexus_bedfiles function +ED_readcount_normals_VCP1_file= "project-ByfFPz00jy1fk6PjpZ95F27J:file-GZ47PPj0xygQ8z3z8yQK1qJF"#"Pan5134_normals_v1.0.0.RData" +ED_VCP1_readcount_BEDfile_pannum = "Pan5134_exomedepth.bed" #VCP2 normals data file -ED_readcount_normals_VCP2_file="Pan5132_normals_v1.0.0.RData" -ED_VCP2_readcount_BEDfile_pannum = "Pan5132" # just put pan number - full bedfile name is made in nexus_bedfiles function +ED_readcount_normals_VCP2_file="project-ByfFPz00jy1fk6PjpZ95F27J:file-GZ8ybG00bx11vq9fXP1j7QQK"#"Pan5132_normals_v1.0.0.RData" +ED_VCP2_readcount_BEDfile_pannum = "Pan5132_exomedepth.bed" #VCP3 normals data file -ED_readcount_normals_VCP3_file="Pan5149_normals_v1.0.0.RData" -ED_VCP3_readcount_BEDfile_pannum = "Pan5149" # just put pan number - full bedfile name is made in nexus_bedfiles function +ED_readcount_normals_VCP3_file=None#"Pan5149_normals_v1.0.0.RData" +ED_VCP3_readcount_BEDfile_pannum = "Pan5149_exomedepth.bed" exomedepth_refgenome_file = "project-ByfFPz00jy1fk6PjpZ95F27J:file-B6ZY7VG2J35Vfvpkj8y0KZ01" #hs37d5.fa.gz from 001 ## readcount app inputs -exomedepth_readcount_reference_genome_input="-ireference_genome=%s" % (exomedepth_refgenome_file) -exomedepth_readcount_bedfile_input="-ibedfile=" -exomedepth_readcount_normalsRdata_input="-inormals_RData=" -exomedepth_readcount_projectname_input="-iproject_name=" -exomedepth_readcount_pannumbers_input="-ibamfile_pannumbers=" -exomedepth_readcount_rdata_output="rdataout" +exomedepth_readcount_reference_genome_input=" -ireference_genome=%s" % (exomedepth_refgenome_file) +exomedepth_readcount_bedfile_input=" -ibedfile=" +exomedepth_readcount_normalsRdata_input=" -inormals_RData=" +exomedepth_readcount_projectname_input=" -iproject_name=" +exomedepth_readcount_pannumbers_input=" -ibamfile_pannumbers=" +exomedepth_readcount_rdata_output="RData" ## ED CNV calling inputs -exomedepth_cnvcalling_reference_genome_input="-ireference_genome=%s" % (exomedepth_refgenome_file) -exomedepth_cnvcalling_readcount_file_input="-ireadcount_file=" -exomedepth_cnvcalling_subpanel_bed_input="-isubpanel_bed=" -exomedepth_cnvcalling_projectname_input="-iproject_name=" -exomedepth_cnvcalling_pannumbers_input="-ibamfile_pannumbers=" +exomedepth_cnvcalling_reference_genome_input=" -ireference_genome=%s" % (exomedepth_refgenome_file) +exomedepth_cnvcalling_readcount_file_input=" -ireadcount_file=" +exomedepth_cnvcalling_subpanel_bed_input=" -isubpanel_bed=" +exomedepth_cnvcalling_projectname_input=" -iproject_name=" +exomedepth_cnvcalling_pannumbers_input=" -ibamfile_pannumbers=" # MokaWES workflow_inputs @@ -460,10 +460,10 @@ "Pan5085", # TSO500 High throughput Synnovis. no UTRS TERT promoter "Pan5112", # TSO500 High throughput BSPS. no UTRS TERT promoter "Pan5114", # TSO500 High throughput Control. no UTRS TERT promoter - "Pan4042", # STG VCP2 BRCA - TO BE REMOVED IN FUTURE UPDATE - "Pan4043", # STG VCP3 - TO BE REMOVED IN FUTURE UPDATE - "Pan4044", # STG VCP1 - TO BE REMOVED IN FUTURE UPDATE - "Pan4049", # STG VCP2 CrCa - TO BE REMOVED IN FUTURE UPDATE + #"Pan4042", # STG VCP2 BRCA - TO BE REMOVED IN FUTURE UPDATE + #"Pan4043", # STG VCP3 - TO BE REMOVED IN FUTURE UPDATE + #"Pan4044", # STG VCP1 - TO BE REMOVED IN FUTURE UPDATE + #"Pan4049", # STG VCP2 CrCa - TO BE REMOVED IN FUTURE UPDATE "Pan4119", # VCP1 Viapath R134 (FH) "Pan4121", # VCP1 Viapath R184 (CF) "Pan4122", # VCP1 Viapath R25 (FGFR) @@ -549,7 +549,7 @@ "Pan4122", "Pan4125", "Pan4126", - "Pan4044", + #"Pan4044", "Pan4821", "Pan4822", "Pan4823", @@ -572,8 +572,8 @@ "Pan4150", "Pan4129", "Pan4130", - "Pan4042", - "Pan4049", + #"Pan4042", + #"Pan4049", "Pan4816", "Pan4817", "Pan4819", @@ -599,7 +599,7 @@ "Pan4145", "Pan4146", "Pan4151", - "Pan4043", + #"Pan4043", "Pan4314", "Pan4351", "Pan4387", @@ -847,7 +847,7 @@ "RPKM_also_analyse": vcp1_panel_list, "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", - "variant_calling_bedfile": "Pan4398data.bed", + "variant_calling_bedfile": "Pan4398data.bed", # CNV not required }, "Pan4125": { # VCP1 R73 DMD (Viapath) "mokapipe": True, @@ -858,7 +858,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", - "exome_depth_cnvcalling_BED": "Pan5135" + "exome_depth_cnvcalling_BED": "Pan4622" }, "Pan4126": { # VCP1 R337_CADASIL Viapath "mokapipe": True, @@ -868,8 +868,7 @@ "RPKM_also_analyse": vcp1_panel_list, "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", - "variant_calling_bedfile": "Pan4398data.bed", - "exome_depth_cnvcalling_BED": "Pan4549" + "variant_calling_bedfile": "Pan4398data.bed",# cnv not required }, "Pan4974": { # VCP1 Viapath (Molecular Haemostasis) R112 "mokapipe": True, @@ -1036,7 +1035,7 @@ "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", "polyedge": "MSH2", - "exome_depth_cnvcalling_BED": "Pan5159" + "exome_depth_cnvcalling_BED": "Pan5160" # use R211 CNV bedfile }, "Pan4130": { # VCP2 R211 polyposis (Viapath) "mokapipe": True, @@ -1103,7 +1102,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed", # CNV not required }, "Pan4134": { # VCP3 R57 (Viapath) "mokapipe": True, @@ -1113,7 +1112,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed", # CNV not required }, "Pan4136": { # VCP3 R58 (Viapath) "mokapipe": True, @@ -1123,7 +1122,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed",# CNV not required }, "Pan4137": { # VCP3 R60 (Viapath) "mokapipe": True, @@ -1133,7 +1132,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed",# CNV not required }, "Pan4138": { # VCP3 R62 (Viapath) "mokapipe": True, @@ -1143,7 +1142,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed",# CNV not required }, "Pan4143": { # VCP3 R66 (Viapath) "mokapipe": True, @@ -1154,7 +1153,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan5174" + # "exome_depth_cnvcalling_BED": "Pan5174" # CNV BED not yet available }, "Pan4144": { # VCP3 R78 (Viapath) "mokapipe": True, @@ -1164,7 +1163,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed",# CNV not required }, "Pan4145": { # VCP3 R79 - CMD (Viapath) "mokapipe": True, @@ -1196,7 +1195,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed",# CNV not required }, "Pan4351": { # VCP3 R227 (Viapath) "mokapipe": True, @@ -1240,7 +1239,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan5179", + # "exome_depth_cnvcalling_BED": "Pan5179", bedfile not yet made }, "Pan4396": { # ArcherDx (Synnovis) "archerdx": True, @@ -1318,7 +1317,7 @@ "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, - "exome_depth_cnvcalling_BED": "Pan4703" + "exome_depth_cnvcalling_BED": "Pan4703", }, "Pan4823": { # VCP1 STG R25_FGFR "mokapipe": True, @@ -1331,7 +1330,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", - "STG": True, + "STG": True, # CNV not required }, "Pan4824": { # VCP1 STG R73_DMD "mokapipe": True, @@ -1345,7 +1344,7 @@ "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, - "exome_depth_cnvcalling_BED": "Pan5135" + "exome_depth_cnvcalling_BED": "Pan4622" }, "Pan4825": { # VCP1 STG R337_cadasil "mokapipe": True, @@ -1358,8 +1357,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", - "STG": True, - "exome_depth_cnvcalling_BED": "Pan4549" + "STG": True,# CNV not required }, "Pan4826": { # VCP3 STG R56 "mokapipe": True, @@ -1371,7 +1369,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4827": { # VCP3 STG R57 "mokapipe": True, @@ -1383,7 +1381,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4828": { # VCP3 STG R58 "mokapipe": True, @@ -1395,7 +1393,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4829": { # VCP3 STG R60 "mokapipe": True, @@ -1407,7 +1405,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4830": { # VCP3 STG R62 "mokapipe": True, @@ -1419,7 +1417,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4831": { # VCP3 STG R66 "mokapipe": True, @@ -1432,7 +1430,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan5174" + # "exome_depth_cnvcalling_BED": "Pan5174" BEDfile not yet available }, "Pan4832": { # VCP3 STG R78 "mokapipe": True, @@ -1444,7 +1442,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4833": { # VCP3 STG R79 "mokapipe": True, @@ -1457,7 +1455,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan5168" + "exome_depth_cnvcalling_BED": "Pan5168", }, "Pan4834": { # VCP3 STG R81 "mokapipe": True, @@ -1482,7 +1480,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4836": { # VCP3 STG R229 "mokapipe": True, @@ -1495,7 +1493,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan5179" + #"exome_depth_cnvcalling_BED": "Pan5179" BEDfile not yet available }, "Pan4819": { # VCP2 STG R210 "mokapipe": True, @@ -1509,7 +1507,7 @@ "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "polyedge": "MSH2", - "exome_depth_cnvcalling_BED": "Pan5159" + "exome_depth_cnvcalling_BED": "Pan5160" # useR211 bedfile }, "Pan4820": { # VCP2 STG R211 "mokapipe": True, diff --git a/demultiplex.py b/demultiplex.py index 78575b11..35d7e160 100644 --- a/demultiplex.py +++ b/demultiplex.py @@ -213,7 +213,7 @@ def already_demultiplexed(self, runfolder): self.samplesheet = self.runfolder + "_SampleSheet.csv" self.samplesheet_path = os.path.join(config.samplesheets_dir, self.samplesheet) # if development run skip the samplesheet check to avoid endless alerts - if not self.check_for_development_run(): + if not self.check_for_development_run(self.samplesheet_path): # run samplesheet checks (uses try to ensure that should an error occur this doesn't affect the other # script functionality ss_verification_results = samplesheet_verifier.run_ss_checks(self.samplesheet_path) @@ -242,7 +242,7 @@ def check_for_development_run(self,samplesheet_path): """ sample_list = [] - with open(self.runfolder_obj.runfolder_samplesheet_path, "r") as samplesheet_stream: + with open(samplesheet_path, "r") as samplesheet_stream: # read the file into a list and loop through the list in reverse (bottom to top). # this allows us to access the sample names, and stop when reach the column headers, skipping the header of the file. for line in reversed(samplesheet_stream.readlines()): diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index 8ba38a62..4ec167ef 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -252,6 +252,7 @@ def __init__(self, runfolder, now, debug_mode=False): # arguments to capture jobids self.depends_list = 'depends_list="${depends_list} -d ${jobid} "' + self.depends_list_ED_readcount = 'depends_list="${depends_list} -d ${EDjobid} "' self.depends_list_gatk = 'depends_list_gatk="${depends_list_gatk} -d ${jobid} "' self.depends_list_recombined = 'depends_list="${depends_list} ${depends_list_gatk} "' # Argument to define depends_list only if the job ID exists @@ -632,8 +633,8 @@ def check_for_development_run(self): sample_list = [] # build list of development pan numbers development_panel_list=[] - for pan in self.panel_dictionary: - if pan["development_run"]: + for pan in self.panel_dictionary.keys(): + if self.panel_dictionary[pan]["development_run"]: development_panel_list.append(pan) with open( @@ -1396,13 +1397,7 @@ def nexus_bedfiles(self, pannumber): + self.panel_dictionary[pannumber]["RPKM_bedfile_pan_number"] + "_RPKM.bed" ) - if self.panel_dictionary[pannumber]["exome_depth_readcount_BED"]: - bed_dict["ED_readcount_bedfile"] = ( - config.app_project - + config.bedfile_folder - + self.panel_dictionary[pannumber]["exome_depth_readcount_BED"] - + "exomedepth.bed" - ) + if self.panel_dictionary[pannumber]["exome_depth_cnvcalling_BED"]: bed_dict["exome_depth_cnvcalling_BED"] = ( config.app_project @@ -1410,6 +1405,7 @@ def nexus_bedfiles(self, pannumber): + self.panel_dictionary[pannumber]["exome_depth_cnvcalling_BED"] + "_CNV.bed" ) + return bed_dict def start_building_dx_run_cmds(self): @@ -1626,11 +1622,12 @@ def determine_exome_depth_requirements(self,pannnumber_list): List of dx run commands """ + # generate list of pan numbers in samplenames to process in ED VCP1=[] VCP2=[] VCP3=[] command_list=[] - # could prob do list comprehension here + for pannumber in set(pannnumber_list): # not all VCP1/2/3 pan numbers need CNV calling if self.panel_dictionary[pannumber]["exome_depth_cnvcalling_BED"]: @@ -1646,7 +1643,7 @@ def determine_exome_depth_requirements(self,pannnumber_list): # first build readcount command. command_list.append(self.build_ED_readcount_cmd(set(VCP1), config.ED_readcount_normals_VCP1_file,config.ED_VCP1_readcount_BEDfile_pannum)) # The output of readcount can be used in multiqc so add this to the multiqc depends list - command_list.append(self.add_to_depends_list("exomedepth", 'depends_list')) + command_list.append(self.add_to_depends_list("exomedepth", 'depends_list_ED_readcount')) # the cnvcalling stage can use the jobid from the readcount stage as an input so run these before the next capture panel for panel in set(VCP1):# then build cnvcalling commands command_list.append(self.build_ED_cnv_calling_cmd(panel)) @@ -1654,14 +1651,14 @@ def determine_exome_depth_requirements(self,pannnumber_list): if len(VCP2)>2: # first build readcount command command_list.append(self.build_ED_readcount_cmd(set(VCP2), config.ED_readcount_normals_VCP2_file,config.ED_VCP2_readcount_BEDfile_pannum)) - command_list.append(self.add_to_depends_list("exomedepth", 'depends_list')) + command_list.append(self.add_to_depends_list("exomedepth", 'depends_list_ED_readcount')) for panel in set(VCP2):# then build cnvcalling commands command_list.append(self.build_ED_cnv_calling_cmd(panel)) if len(VCP3)>2: # first build readcount command command_list.append(self.build_ED_readcount_cmd(set(VCP3), config.ED_readcount_normals_VCP3_file,config.ED_VCP3_readcount_BEDfile_pannum)) - command_list.append(self.add_to_depends_list("exomedepth", 'depends_list')) + command_list.append(self.add_to_depends_list("exomedepth", 'depends_list_ED_readcount')) for panel in set(VCP2):# then build cnvcalling commands command_list.append(self.build_ED_cnv_calling_cmd(panel)) @@ -1678,15 +1675,13 @@ def build_ED_readcount_cmd(self,pannumber_list, normals_file,readcount_bedfile_p dx run cmd (string) """ #build bedfile address from the readcount_bedfile_pannum input - bedfiles = self.nexus_bedfiles(readcount_bedfile_pannum) - readcount_bedfile=bedfiles["exome_depth_readcount_BED"] + readcount_bedfile = "%s%s%s" % (config.app_project,config.bedfile_folder,readcount_bedfile_pannum) dx_command_list = [ self.ED_readcount_command, config.exomedepth_readcount_reference_genome_input, config.exomedepth_readcount_bedfile_input, readcount_bedfile, - config.exomedepth_cnvcalling_subpanel_bed_input, config.exomedepth_readcount_normalsRdata_input, normals_file, config.exomedepth_readcount_projectname_input, @@ -1709,13 +1704,13 @@ def build_ED_cnv_calling_cmd(self,pannumber): Returns: dx run cmd (string) """ - # build bedfile address using the given pan number extract exome_depth_cnvcalling_BED from panel config dict + # pull out the appropriate bedfile for ED cnvcalling app BEDfrom panel config dict (exome_depth_cnvcalling_BED) + # note the Pan number for this BED will be different to that used to name the sample bedfiles = self.nexus_bedfiles(pannumber) ed_cnvcalling_bedfile = bedfiles["exome_depth_cnvcalling_BED"] dx_command_list = [ self.ED_cnvcalling_command, - config.exomedepth_cnvcalling_reference_genome_input, config.exomedepth_cnvcalling_readcount_file_input, "$EDjobid:%s" % (config.exomedepth_readcount_rdata_output), config.exomedepth_cnvcalling_subpanel_bed_input, @@ -2364,6 +2359,8 @@ def add_to_depends_list(self, fastq, depends_type): return self.depends_list_gatk elif depends_type=='depends_list_recombined': return self.depends_list_recombined + elif depends_type=='depends_list_ED_readcount': + return self.depends_list_ED_readcount def create_multiqc_command(self): """ From cc8a966e5c1eaca346ef5dc8436599f346917e1f Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Fri, 6 Oct 2023 11:52:45 +0100 Subject: [PATCH 06/16] think this relates to comments in dx run cmds --- upload_and_setoff_workflows.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index 4ec167ef..db0704d3 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -149,11 +149,11 @@ def __init__(self, runfolder, now, debug_mode=False): self.list_of_processed_samples = [] # DNA Nexus commands to be built on later - self.source_command = "#!/bin/bash\n. %s\n" % ( + self.source_command = "#!/bin/bash\n. %s" % ( config.sdk_source_cmd ) - self.empty_depends = "depends_list=''\n" - self.empty_gatk_depends = "depends_list_gatk=''\n" + self.empty_depends = "depends_list=''" + self.empty_gatk_depends = "depends_list_gatk=''" self.createprojectcommand = 'project_id="$(dx new project --bill-to %s "%s" --brief --auth-token %s)"\n' self.mokapipe_command = ( "jobid=$(dx run %s%s --priority high -y --name " @@ -893,7 +893,7 @@ def write_create_project_script(self): # open bash script with open(self.project_bash_script_path, "w") as project_script: project_script.write(self.source_command + "\n") - project_script.write(self.empty_depends) + project_script.write(self.empty_depends + "\n") project_script.write( self.createprojectcommand % ( @@ -1492,6 +1492,11 @@ def start_building_dx_run_cmds(self): # If panel is to be processed using mokapipe if self.panel_dictionary[panel]["mokapipe"]: + commands_list.append("#For each sample there are 5 lines of commands. The dx run command for the workflow. and the jobid is then added to two depends_on lists.") + commands_list.append("#The gatk depends on list is used for apps that only need to wait for the individual sample processes to finish (eg cnv calling") + commands_list.append("#The depends_on list is used for jobs that also require run wide jobs to finish (eg peddy)") + commands_list.append("#The 4th line passes the jobid to decision_support_tool_inputs.py which returns some inputs for the congenica upload command") + commands_list.append("#The 5th line uses this output and echos the dx run command to a bash script to be run after QC is checked") # call function to build the Mokapipe command and add to command list and depends list commands_list.append( self.create_mokapipe_command(fastq, panel) @@ -1551,11 +1556,14 @@ def start_building_dx_run_cmds(self): commands_list.append(self.add_to_depends_list("peddy", 'depends_list')) if exome_depth: + commands_list.append("# Exome depth is run once per capture and then once per Pan number within that capture") # exome depth is run once per capture, and then for each capture, one per pannumber. This function returns a list of commands so need to add these to commands list for cmd in self.determine_exome_depth_requirements(pannnumber_list): commands_list.append(cmd) if TSO500: + commands_list.append("#The TSOapp is set off first. This utilises the --wait flag, so the bash script waits until this job finishes before running the coverage, hap.py and fastqc commands using the samplesheet to determine expected files and thier locations ") + commands_list.append("#All jobs apart from control samples are added to the depends on list used to delay multiqc") # build command for the TSO500 app and set off fastqc commands commands_list.append(self.create_tso500_command()) commands_list.append(self.add_to_depends_list("TSO500", 'depends_list')) @@ -2227,6 +2235,7 @@ def prepare_rpkm_list(self, rpkm_list): # return list to be used to build rpkm command(s). return cleaned_list + # TODO set this up so it only runs the RPKM app if there are enough samples files (minimum 3 required by the app) def create_rpkm_command(self, pannumber): """ Input = Pannumber for a single RPKM analysis @@ -2993,13 +3002,7 @@ def upload_log_files(self): Returns = filepath to the logfile containing output from the command, string of files to be uploaded and name of the stage to test """ # define where files to be uploaded to - nexus_upload_folder = ( - "/" - + self.runfolder_obj.nexus_project_name.replace( - self.nexusproject, "" - ) - + "/Logfiles/" - ) + nexus_upload_folder = ("/%s/Logfiles/" % ("_".join(self.runfolder_obj.nexus_project_name.split("_")[1:]))) # create a list of files to be used to check outputs files_to_upload_list = [] # create a space delimited string of files to be uploaded defined by the logger class From 3772ed08d3f4673ba29c3b2ce9fafe81a8bb50db Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Fri, 13 Oct 2023 13:48:34 +0100 Subject: [PATCH 07/16] TSO updates started --- automate_demultiplex_config.py | 7 ++-- upload_and_setoff_workflows.py | 67 +++++++++++++++++++++++++++++++--- 2 files changed, 65 insertions(+), 9 deletions(-) diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py index e579fb4b..24602d93 100644 --- a/automate_demultiplex_config.py +++ b/automate_demultiplex_config.py @@ -170,9 +170,9 @@ congenica_app_path = "Apps/congenica_upload_v1.3.2" congenica_SFTP_upload_app = "applet-GFfJpj80jy1x1Bz1P1Bk3vQf" -# TSO500 app +# TSO500 app TODO update to new version of app v1.6.0 tso500_app = "applet-GPgkz0j0jy1Yf4XxkXjVgKfv" # Apps/TSO500_v1.5.1 -tso500_app_name = "TSO500_v1.5.1" +tso500_app_name = "TSO500_v1.6.0" tso500_docker_image = ( "project-ByfFPz00jy1fk6PjpZ95F27J:file-Fz9Zyx00b5j8xKVkKv4fZ6JB" ) @@ -383,6 +383,7 @@ TSO500_samplesheet_stage = " -isamplesheet=" TSO500_analysis_options_stage = " -ianalysis_options=" TSO500_project_name_stage = " -iproject_name=" +TSO500_runfolder_name_stage = " -irunfolder_name=" # app instance types TSO500_analysis_instance_high_throughput = "mem1_ssd1_v2_x72" @@ -639,7 +640,7 @@ "Pan5085", "Pan5112", "Pan5114", -] # note the settings from the first item in this list are used when setting off the TSO500_output_parser commands. +] default_panel_properties = { diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index db0704d3..5840e4bb 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -114,6 +114,7 @@ def __init__(self, runfolder): + self.runfolder_name + "_congenica_upload_commands.sh" ) + #TODO copy lines above to create separate dx run commands output script for TSO (to be run by duty binfx) self.nexus_project_name = "" self.nexus_path = "" self.nexus_project_id = "" @@ -148,6 +149,9 @@ def __init__(self, runfolder, now, debug_mode=False): # list of fastqs to get ngs run number and WES batch self.list_of_processed_samples = [] + #list of TSO samplesheets + self.TSO500_samplesheets_list = [] + # DNA Nexus commands to be built on later self.source_command = "#!/bin/bash\n. %s" % ( config.sdk_source_cmd @@ -336,7 +340,7 @@ def quarterback(self): if TSO500_sample_list: self.list_of_processed_samples, self.fastq_string = ( TSO500_sample_list, - self.runfolder_obj.runfolder_samplesheet_path, + self.runfolder_obj.runfolder_samplesheet_path, #TODO this sets the fastq_string to be the samplesheet path ) else: @@ -367,9 +371,12 @@ def quarterback(self): view_users_list, admin_users_list ).rstrip() ) + # split tso samplesheet and write split versions to the runfolder # build upload agent command for fastq upload and write stdout to ua_stdout_log # pass path to function which checks files were uploaded without error if TSO500_sample_list: + # split TSO samplesheet to multiple sheets with <=16 samples/sheet + self.TSO500_samplesheets_list = self.split_tso500_sampleheet(): backup_attempt_count = 1 while backup_attempt_count < 5: self.loggers.script.info( @@ -385,6 +392,9 @@ def quarterback(self): # increase backup count backup_attempt_count += 1 + #upload fastqs. if TSO500 run, this uploads the samplesheet to the project root + #TODO make this an else for the above if TSO500_sample_list. then split and upload TSO samplesheets separately. + # TODO check whether upload_fastqs() output or related variables are used elsewhere self.look_for_upload_errors(self.upload_fastqs()) # upload cluster density files and check upload was successful. @@ -624,6 +634,43 @@ def check_for_TSO500(self): open(self.loggers.upload_agent.filepath, "w").close() return sample_list + def split_tso500_sampleheet(self): + """ + take TSO500 samplesheet and split in to parts with <=16 samples/sheet + write samplesheets to runfolder + return list of samplesheet paths? or just names (if they're saved in the runfolder, + they'll be uploaded to DNAnexus, can access from there for dx run cmds) + """ + # samplesheet in the runfolder + samplesheet_file = self.runfolder_samplesheet_name + # Read all lines from the sample sheet + with open(samplesheet_file) as samplesheet: + all_lines = samplesheet.readlines() + + # Separate header from samples. TSO samplesheet header is the first 25 lines of the file + samplesheet_header = all_lines[:25] + # sample lines start with "TSO". This excludes empty lines below the samples list, i.e. lines containing ",,,,,,," + samples = [sample for sample in all_lines[25:] if sample.startswith("TSO")] + + # Split samples into batches of 16 + batches = [samples[i:i + 16] for i in range(0, len(samples), 16)] + + # Write batches to separate files named "PartXofY", and add samplesheet to list + samplesheet_list = [] + number_of_batches = len(batches) + samplesheet_base_name = samplesheet_file.split(".csv")[0] + for samplesheet_count, batch in enumerate(batches, start=1): + #capture samplesheet file path to write samplesheet paths to the runfolder + samplesheet_filename = "%sPart%sof%s.csv" % (samplesheet_base_name,samplesheet_count,number_of_batches) + # capture samplesheet name to write to list- use runfolder name + samplesheet_name = "%s_SampleSheetPart%sof%s.csv" % (self.runfolder_obj.runfolder_name,samplesheet_count,number_of_batches) + samplesheet_list.append(samplesheet_name) + with open(samplesheet_filename, "a") as new_samplesheet: + new_samplesheet.writelines(samplesheet_header) + new_samplesheet.writelines(batch) + + return(samplesheet_list) + def check_for_development_run(self): """ Read samplesheet looking for development pan number. @@ -1562,10 +1609,14 @@ def start_building_dx_run_cmds(self): commands_list.append(cmd) if TSO500: - commands_list.append("#The TSOapp is set off first. This utilises the --wait flag, so the bash script waits until this job finishes before running the coverage, hap.py and fastqc commands using the samplesheet to determine expected files and thier locations ") + commands_list.append("#The TSOapp is set off first. This utilises the --wait flag, so the bash script waits until this job finishes before running the coverage, hap.py and fastqc commands using the samplesheet to determine expected files and their locations ") commands_list.append("#All jobs apart from control samples are added to the depends on list used to delay multiqc") # build command for the TSO500 app and set off fastqc commands - commands_list.append(self.create_tso500_command()) + # TODO add for loop here to loop through samplesheets and write command for each + for samplesheet in self.TSO500_samplesheets_list: + commands_list.append(self.create_tso500_command(samplesheet)) + + # TODO modify this to handle creating separate file for TSO commands commands_list.append(self.add_to_depends_list("TSO500", 'depends_list')) # For TSO samples, the fastqs are created within DNAnexus and the @@ -1806,7 +1857,7 @@ def create_fastqc_command(self, fastqs): return dx_command - def create_tso500_command(self): + def create_tso500_command(self,samplesheet): """ Build dx run command for tso500 docker app. Will assess if it's a novaseq or not from the runfoldername and if it's @@ -1852,6 +1903,7 @@ def create_tso500_command(self): ## docker image (from config) ## runfolder_tar and samplesheet paths (from runfolder_obj class) ## analysis options eg --isNovaSeq flag + # TODO modify for new way of setting off app. WAIT removed dx_command_list = [ self.tso500_dx_command, # ends with --name so supply the runfolder name to name the job self.runfolder_obj.runfolder_name, @@ -1860,13 +1912,16 @@ def create_tso500_command(self): config.TSO500_samplesheet_stage, self.runfolder_obj.nexus_project_id + ":" - + self.runfolder_obj.runfolder_samplesheet_name, + + self.#TODO not sure if this will work...find runfolder name in DNAnexus project + + "/" + + samplesheet config.TSO500_project_name_stage, self.runfolder_obj.nexus_project_name, + config.TSO500_runfolder_name_stage, #TODO take this out again? + self.#find runfolder name in DNAnexus project config.TSO500_analysis_options_stage, TSO500_analysis_options, instance_type, - "--wait ", self.dest, self.dest_cmd, self.token, From 363bddda424face0e861f9d8ff6b89e0038cfb9b Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Mon, 16 Oct 2023 11:35:08 +0100 Subject: [PATCH 08/16] update to newer exome depth bed and PON --- automate_demultiplex_config.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py index e579fb4b..77b58179 100644 --- a/automate_demultiplex_config.py +++ b/automate_demultiplex_config.py @@ -277,11 +277,11 @@ ED_cnvcalling_path = "Apps/ED_cnv_calling_v1.2.0" ED_cnvcalling_instance_type = "mem1_ssd1_v2_x4" #VCP1 exome depth -ED_readcount_normals_VCP1_file= "project-ByfFPz00jy1fk6PjpZ95F27J:file-GZ47PPj0xygQ8z3z8yQK1qJF"#"Pan5134_normals_v1.0.0.RData" -ED_VCP1_readcount_BEDfile_pannum = "Pan5134_exomedepth.bed" +ED_readcount_normals_VCP1_file= "project-ByfFPz00jy1fk6PjpZ95F27J:file-GZYK6380f66PPy4kjzVQ7xj8"#"Pan5191_normals_v1.0.0.RData" +ED_VCP1_readcount_BEDfile_pannum = "Pan5191_exomedepth.bed" #VCP2 normals data file -ED_readcount_normals_VCP2_file="project-ByfFPz00jy1fk6PjpZ95F27J:file-GZ8ybG00bx11vq9fXP1j7QQK"#"Pan5132_normals_v1.0.0.RData" -ED_VCP2_readcount_BEDfile_pannum = "Pan5132_exomedepth.bed" +ED_readcount_normals_VCP2_file="project-ByfFPz00jy1fk6PjpZ95F27J:file-GZYbq400YG627Q12g1bbP440"#"Pan5188_normals_v1.0.0.RData" +ED_VCP2_readcount_BEDfile_pannum = "Pan5188_exomedepth.bed" #VCP3 normals data file ED_readcount_normals_VCP3_file=None#"Pan5149_normals_v1.0.0.RData" ED_VCP3_readcount_BEDfile_pannum = "Pan5149_exomedepth.bed" @@ -1035,7 +1035,7 @@ "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", "polyedge": "MSH2", - "exome_depth_cnvcalling_BED": "Pan5160" # use R211 CNV bedfile + "exome_depth_cnvcalling_BED": "Pan5193" # use R211 CNV bedfile }, "Pan4130": { # VCP2 R211 polyposis (Viapath) "mokapipe": True, @@ -1047,7 +1047,7 @@ "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", "polyedge": "MSH2", - "exome_depth_cnvcalling_BED": "Pan5160" + "exome_depth_cnvcalling_BED": "Pan5193" }, "Pan5186": { # VCP2 R414 APC (Viapath) "mokapipe": True, @@ -1507,7 +1507,7 @@ "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "polyedge": "MSH2", - "exome_depth_cnvcalling_BED": "Pan5160" # useR211 bedfile + "exome_depth_cnvcalling_BED": "Pan5193" # useR211 bedfile }, "Pan4820": { # VCP2 STG R211 "mokapipe": True, @@ -1521,7 +1521,7 @@ "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "polyedge": "MSH2", - "exome_depth_cnvcalling_BED": "Pan5160" + "exome_depth_cnvcalling_BED": "Pan5193" }, "Pan5185": { # VCP2 STG R414 "mokapipe": True, From b5bc093e5cd853b5c22cddfb2016234f94094794 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Wed, 18 Oct 2023 15:52:55 +0100 Subject: [PATCH 09/16] changes to split up TSO runs, use latest TSO app, latest duty_csv app --- automate_demultiplex_config.py | 22 +++-- upload_and_setoff_workflows.py | 150 +++++++++++++++++++++------------ 2 files changed, 108 insertions(+), 64 deletions(-) diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py index 24602d93..8bcf26b1 100644 --- a/automate_demultiplex_config.py +++ b/automate_demultiplex_config.py @@ -56,6 +56,12 @@ "999999_A01229_0182_AHM2TSO500", ] +# TSO500 batch size (for splitting samplesheet) +if testing: + batch_size = 2 +else: + batch_size = 16 + # path to log file which records the output of the upload agent upload_and_setoff_workflow_logfile = ( "{document_root}/automate_demultiplexing_logfiles/upload_agent_script_logfiles/" @@ -134,7 +140,7 @@ # MokaSNP ID mokasnp_pipeline_ID = "5091" # TSO500 pipeline ID -TSO_pipeline_ID = "5237" +TSO_pipeline_ID = "5288" #TSO v1.6 # -- Moka WES test status-- # Test Status = NextSEQ sequencing @@ -170,8 +176,8 @@ congenica_app_path = "Apps/congenica_upload_v1.3.2" congenica_SFTP_upload_app = "applet-GFfJpj80jy1x1Bz1P1Bk3vQf" -# TSO500 app TODO update to new version of app v1.6.0 -tso500_app = "applet-GPgkz0j0jy1Yf4XxkXjVgKfv" # Apps/TSO500_v1.5.1 +# TSO500 app +tso500_app = "applet-GZgv0Jj0jy1Yfbx3QvqyKjzp" # Apps/TSO500_v1.6.0 tso500_app_name = "TSO500_v1.6.0" tso500_docker_image = ( "project-ByfFPz00jy1fk6PjpZ95F27J:file-Fz9Zyx00b5j8xKVkKv4fZ6JB" @@ -1256,7 +1262,7 @@ }, "Pan4969": { # TSO500 no UTRs. TERT promoter "TSO500": True, - "sambamba_bedfile": "Pan5130dataSambamba.bed", + "sambamba_bedfile": "Pan5205dataSambamba.bed", "clinical_coverage_depth": 100, "multiqc_coverage_level": 100, "coverage_min_basecall_qual": 25, @@ -1265,7 +1271,7 @@ "Pan5085": { # TSO500 High throughput Synnovis. no UTRs. TERT promoter "TSO500": True, "TSO500_high_throughput": True, - "sambamba_bedfile": "Pan5130dataSambamba.bed", + "sambamba_bedfile": "Pan5205dataSambamba.bed", "clinical_coverage_depth": 100, "multiqc_coverage_level": 100, "coverage_min_basecall_qual": 25, @@ -1274,7 +1280,7 @@ "Pan5112": { # TSO500 High throughput BSPS. no UTRs. TERT promoter "TSO500": True, "TSO500_high_throughput": True, - "sambamba_bedfile": "Pan5130dataSambamba.bed", + "sambamba_bedfile": "Pan5205dataSambamba.bed", "clinical_coverage_depth": 100, "multiqc_coverage_level": 100, "coverage_min_basecall_qual": 25, @@ -1284,7 +1290,7 @@ "Pan5114": { # TSO500 High throughput Control. no UTRs. TERT promoter "TSO500": True, "TSO500_high_throughput": True, - "sambamba_bedfile": "Pan5130dataSambamba.bed", + "sambamba_bedfile": "Pan5205dataSambamba.bed", "clinical_coverage_depth": 100, "multiqc_coverage_level": 100, "coverage_min_basecall_qual": 25, @@ -1765,7 +1771,7 @@ } duty_csv_id = ( - "project-ByfFPz00jy1fk6PjpZ95F27J:applet-GQg9J280jy1Zf79KGx9gk5K3" + "project-ByfFPz00jy1fk6PjpZ95F27J:applet-GZYx3Kj0kKj3YBV7qgK6VjXQ" ) duty_csv_inputs = { # tso_pannumbers should not include the dry lab pan number diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index 5840e4bb..a458a896 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -114,6 +114,11 @@ def __init__(self, runfolder): + self.runfolder_name + "_congenica_upload_commands.sh" ) + self.TSO500_post_run_command_script = ( + config.DNA_Nexus_workflow_logfolder + + self.runfolder_name + + "_TSO_post_run_commands.sh" + ) #TODO copy lines above to create separate dx run commands output script for TSO (to be run by duty binfx) self.nexus_project_name = "" self.nexus_path = "" @@ -340,7 +345,7 @@ def quarterback(self): if TSO500_sample_list: self.list_of_processed_samples, self.fastq_string = ( TSO500_sample_list, - self.runfolder_obj.runfolder_samplesheet_path, #TODO this sets the fastq_string to be the samplesheet path + self.runfolder_obj.runfolder_samplesheet_path, #this sets the fastq_string to be the samplesheet path ) else: @@ -376,7 +381,7 @@ def quarterback(self): # pass path to function which checks files were uploaded without error if TSO500_sample_list: # split TSO samplesheet to multiple sheets with <=16 samples/sheet - self.TSO500_samplesheets_list = self.split_tso500_sampleheet(): + self.TSO500_samplesheets_list = self.split_TSO500_sampleheet() backup_attempt_count = 1 while backup_attempt_count < 5: self.loggers.script.info( @@ -393,8 +398,6 @@ def quarterback(self): backup_attempt_count += 1 #upload fastqs. if TSO500 run, this uploads the samplesheet to the project root - #TODO make this an else for the above if TSO500_sample_list. then split and upload TSO samplesheets separately. - # TODO check whether upload_fastqs() output or related variables are used elsewhere self.look_for_upload_errors(self.upload_fastqs()) # upload cluster density files and check upload was successful. @@ -634,7 +637,7 @@ def check_for_TSO500(self): open(self.loggers.upload_agent.filepath, "w").close() return sample_list - def split_tso500_sampleheet(self): + def split_TSO500_sampleheet(self): """ take TSO500 samplesheet and split in to parts with <=16 samples/sheet write samplesheets to runfolder @@ -642,7 +645,8 @@ def split_tso500_sampleheet(self): they'll be uploaded to DNAnexus, can access from there for dx run cmds) """ # samplesheet in the runfolder - samplesheet_file = self.runfolder_samplesheet_name + samplesheet_file = os.path.join(self.runfolder_obj.runfolderpath, self.runfolder_obj.runfolder_samplesheet_name) + # Read all lines from the sample sheet with open(samplesheet_file) as samplesheet: all_lines = samplesheet.readlines() @@ -652,8 +656,8 @@ def split_tso500_sampleheet(self): # sample lines start with "TSO". This excludes empty lines below the samples list, i.e. lines containing ",,,,,,," samples = [sample for sample in all_lines[25:] if sample.startswith("TSO")] - # Split samples into batches of 16 - batches = [samples[i:i + 16] for i in range(0, len(samples), 16)] + # Split samples into batches (size specified in config) + batches = [samples[i:i + config.batch_size] for i in range(0, len(samples), config.batch_size)] # Write batches to separate files named "PartXofY", and add samplesheet to list samplesheet_list = [] @@ -661,11 +665,11 @@ def split_tso500_sampleheet(self): samplesheet_base_name = samplesheet_file.split(".csv")[0] for samplesheet_count, batch in enumerate(batches, start=1): #capture samplesheet file path to write samplesheet paths to the runfolder - samplesheet_filename = "%sPart%sof%s.csv" % (samplesheet_base_name,samplesheet_count,number_of_batches) + samplesheet_filepath = "%sPart%sof%s.csv" % (samplesheet_base_name,samplesheet_count,number_of_batches) # capture samplesheet name to write to list- use runfolder name samplesheet_name = "%s_SampleSheetPart%sof%s.csv" % (self.runfolder_obj.runfolder_name,samplesheet_count,number_of_batches) samplesheet_list.append(samplesheet_name) - with open(samplesheet_filename, "a") as new_samplesheet: + with open(samplesheet_filepath, "a") as new_samplesheet: new_samplesheet.writelines(samplesheet_header) new_samplesheet.writelines(batch) @@ -1608,51 +1612,23 @@ def start_building_dx_run_cmds(self): for cmd in self.determine_exome_depth_requirements(pannnumber_list): commands_list.append(cmd) + # write TSO commands if a TSO run. if TSO500: - commands_list.append("#The TSOapp is set off first. This utilises the --wait flag, so the bash script waits until this job finishes before running the coverage, hap.py and fastqc commands using the samplesheet to determine expected files and their locations ") - commands_list.append("#All jobs apart from control samples are added to the depends on list used to delay multiqc") - # build command for the TSO500 app and set off fastqc commands - # TODO add for loop here to loop through samplesheets and write command for each + commands_list.append("#The TSOapp is set off once for each samplesheet made") + commands_list.append("#Other jobs must be set off manually by running the file once the pipeline has finished") + # build commands for the TSO500 app and set off fastqc commands (need a command per samplesheet) for samplesheet in self.TSO500_samplesheets_list: commands_list.append(self.create_tso500_command(samplesheet)) - # TODO modify this to handle creating separate file for TSO commands - commands_list.append(self.add_to_depends_list("TSO500", 'depends_list')) + self.build_TSO500_post_run_commands() + + # TSO500 multiqc commands are written to a separate file with a function called above + if not TSO500: + commands_list.append(self.create_multiqc_command()) + commands_list.append(self.add_to_depends_list("MultiQC", 'depends_list')) + commands_list.append(self.create_upload_multiqc_command(TSO500)) + commands_list.append(self.add_to_depends_list("UploadMultiQC", 'depends_list')) - # For TSO samples, the fastqs are created within DNAnexus and the - # commands are generated using sample names parsed from the - # samplesheet. If for whatever reason those fastqs are not created - # by the DNAnexus app, the downstream job will not set off and - # therefore will produce no job ID to provide to the depends_list, - # which will create an error/ slack alert. To solve this problem, - # the job ID is only added to the depends list if it exits - for sample in self.list_of_processed_samples: - pannumber = re.search(r"Pan\d+", sample).group() - commands_list.append( - self.create_fastqc_command(sample) - ) - # Only add to depends_list if job ID from previous command - # is not empty - commands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) - - commands_list.append(self.create_sambamba_cmd(sample, pannumber)) - # Exclude negative controls from the depends list as the NTC - # coverage calculation can often fail. We want the coverage - # report for the NTC sample to help assess contamination. - # Only add to depends_list if job ID from previous command - # is not empty - commands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) - - if "HD200" in sample: - commands_list.append(self.create_sompy_cmd(sample, pannumber)) - # Only add to depends_list if job ID from previous command - # is not empty - commands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list("sompy", 'depends_list')) - - commands_list.append(self.create_multiqc_command()) - commands_list.append(self.add_to_depends_list("MultiQC", 'depends_list')) - commands_list.append(self.create_upload_multiqc_command(TSO500)) - commands_list.append(self.add_to_depends_list("UploadMultiQC", 'depends_list')) # setoff the below commands later as they are not depended upon by # MultiQC but are required for duty_csv if rpkm_list: @@ -1664,10 +1640,73 @@ def start_building_dx_run_cmds(self): commands_list.append(self.add_to_depends_list("rpkm", 'depends_list')) commands_list.append(self.add_to_depends_list("depends", 'depends_list_recombined')) - commands_list.append(self.create_duty_csv_command()) + if not TSO500: + commands_list.append(self.create_duty_csv_command()) return commands_list + def build_TSO500_post_run_commands(self): + """ + Function to build TSO500 commands to run after pipeline, i.e. + Fastqc, sambamba, sompy, multiqc, upload multiqc and duty_csv + Commands must be written to file _TSO_post_run_commands.sh + which can be run manually once pipeline done. + For TSO samples, the fastqs are created within DNAnexus and the + commands are generated using sample names parsed from the + samplesheet. If for whatever reason those fastqs are not created + by the DNAnexus app, the downstream job will not set off and + therefore will produce no job ID to provide to the depends_list, + which will create an error/ slack alert. To solve this problem, + the job ID is only added to the depends list if it exits + """ + # Update script log file to say what is being done. + self.loggers.script.info("Building dx run commands for TSO500 post pipeline processing") + + # list to hold all commands. + TSO500 = True + TSOcommands_list = [] + TSOcommands_list.append(self.source_command) + TSOcommands_list.append(self.empty_depends) + TSOcommands_list.append(self.empty_gatk_depends) + + for sample in self.list_of_processed_samples: + pannumber = re.search(r"Pan\d+", sample).group() + TSOcommands_list.append( + self.create_fastqc_command(sample) + ) + # Only add to depends_list if job ID from previous command + # is not empty + TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) + + TSOcommands_list.append(self.create_sambamba_cmd(sample, pannumber)) + # Exclude negative controls from the depends list as the NTC + # coverage calculation can often fail. We want the coverage + # report for the NTC sample to help assess contamination. + # Only add to depends_list if job ID from previous command + # is not empty + TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) + + if "HD200" in sample: + TSOcommands_list.append(self.create_sompy_cmd(sample, pannumber)) + # Only add to depends_list if job ID from previous command + # is not empty + TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list("sompy", 'depends_list')) + + TSOcommands_list.append(self.create_upload_multiqc_command(TSO500)) + TSOcommands_list.append(self.add_to_depends_list("UploadMultiQC", 'depends_list')) + + TSOcommands_list.append(self.create_duty_csv_command()) + + with open( + self.runfolder_obj.TSO500_post_run_command_script, "w" + ) as TSO500_commands: + # remove any None values from the command_list + TSO500_commands.writelines( + [line + "\n" for line in filter(None, TSOcommands_list)] + ) + + return TSOcommands_list + def determine_exome_depth_requirements(self,pannnumber_list): """ This function takes a list of all pan numbers found on this run. @@ -1903,7 +1942,6 @@ def create_tso500_command(self,samplesheet): ## docker image (from config) ## runfolder_tar and samplesheet paths (from runfolder_obj class) ## analysis options eg --isNovaSeq flag - # TODO modify for new way of setting off app. WAIT removed dx_command_list = [ self.tso500_dx_command, # ends with --name so supply the runfolder name to name the job self.runfolder_obj.runfolder_name, @@ -1912,13 +1950,13 @@ def create_tso500_command(self,samplesheet): config.TSO500_samplesheet_stage, self.runfolder_obj.nexus_project_id + ":" - + self.#TODO not sure if this will work...find runfolder name in DNAnexus project + + self.runfolder_subdir + "/" - + samplesheet + + samplesheet, config.TSO500_project_name_stage, self.runfolder_obj.nexus_project_name, - config.TSO500_runfolder_name_stage, #TODO take this out again? - self.#find runfolder name in DNAnexus project + config.TSO500_runfolder_name_stage, + self.runfolder_subdir, config.TSO500_analysis_options_stage, TSO500_analysis_options, instance_type, From 080c32a0cb63ed3f83a3a384447ee6e578c227f9 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Wed, 18 Oct 2023 16:46:30 +0100 Subject: [PATCH 10/16] corrections from code review --- upload_and_setoff_workflows.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index a458a896..027670d3 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -381,7 +381,7 @@ def quarterback(self): # pass path to function which checks files were uploaded without error if TSO500_sample_list: # split TSO samplesheet to multiple sheets with <=16 samples/sheet - self.TSO500_samplesheets_list = self.split_TSO500_sampleheet() + self.TSO500_samplesheets_list = self.split_TSO500_samplesheet() backup_attempt_count = 1 while backup_attempt_count < 5: self.loggers.script.info( @@ -637,7 +637,7 @@ def check_for_TSO500(self): open(self.loggers.upload_agent.filepath, "w").close() return sample_list - def split_TSO500_sampleheet(self): + def split_TSO500_samplesheet(self): """ take TSO500 samplesheet and split in to parts with <=16 samples/sheet write samplesheets to runfolder @@ -657,6 +657,8 @@ def split_TSO500_sampleheet(self): samples = [sample for sample in all_lines[25:] if sample.startswith("TSO")] # Split samples into batches (size specified in config) + # batches is a list of lists, where each list is a subset of the samples from the samplesheet + # e.g. if batch_size=16, each list will contain up to 16 samples batches = [samples[i:i + config.batch_size] for i in range(0, len(samples), config.batch_size)] # Write batches to separate files named "PartXofY", and add samplesheet to list From df4842dd05352559a4bce005dc833dcf85237e71 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Thu, 19 Oct 2023 12:05:49 +0100 Subject: [PATCH 11/16] update samplesheet splitting function and correct error in for loop for post run commands list for TSO --- upload_and_setoff_workflows.py | 60 +++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 19 deletions(-) diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index 027670d3..d110dba1 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -647,14 +647,36 @@ def split_TSO500_samplesheet(self): # samplesheet in the runfolder samplesheet_file = os.path.join(self.runfolder_obj.runfolderpath, self.runfolder_obj.runfolder_samplesheet_name) + samplesheet_header = [] + samples = [] + no_sample_lines = 0 + expected_data_headers = ["Sample_ID", "Sample_Name", "index"] + # Read all lines from the sample sheet with open(samplesheet_file) as samplesheet: - all_lines = samplesheet.readlines() + for line in reversed(samplesheet.readlines()): + # stop when get to data headers section + if any(header in line for header in expected_data_headers): + break + # skip empty lines (check first element of the line, after splitting on comma) + elif len(line.split(",")[0]) < 2: + pass + # If its a line containing a sample:: + elif line.startswith("TSO"): + samples.append(line) + no_sample_lines += 1 + # get header + with open(samplesheet_file) as samplesheet: + for line in samplesheet.readlines(): + # stop when get to data headers section- add header line to header then break + if any(header in line for header in expected_data_headers): + samplesheet_header.append(line) + break + else: + samplesheet_header.append(line) - # Separate header from samples. TSO samplesheet header is the first 25 lines of the file - samplesheet_header = all_lines[:25] - # sample lines start with "TSO". This excludes empty lines below the samples list, i.e. lines containing ",,,,,,," - samples = [sample for sample in all_lines[25:] if sample.startswith("TSO")] + # reverse samples list to get back in correct order (starting at sample 1) + samples.reverse() # Split samples into batches (size specified in config) # batches is a list of lists, where each list is a subset of the samples from the samplesheet @@ -1676,23 +1698,23 @@ def build_TSO500_post_run_commands(self): TSOcommands_list.append( self.create_fastqc_command(sample) ) - # Only add to depends_list if job ID from previous command - # is not empty - TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) - - TSOcommands_list.append(self.create_sambamba_cmd(sample, pannumber)) - # Exclude negative controls from the depends list as the NTC - # coverage calculation can often fail. We want the coverage - # report for the NTC sample to help assess contamination. - # Only add to depends_list if job ID from previous command - # is not empty - TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) + # Only add to depends_list if job ID from previous command + # is not empty + TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) - if "HD200" in sample: - TSOcommands_list.append(self.create_sompy_cmd(sample, pannumber)) + TSOcommands_list.append(self.create_sambamba_cmd(sample, pannumber)) + # Exclude negative controls from the depends list as the NTC + # coverage calculation can often fail. We want the coverage + # report for the NTC sample to help assess contamination. # Only add to depends_list if job ID from previous command # is not empty - TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list("sompy", 'depends_list')) + TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) + + if "HD200" in sample: + TSOcommands_list.append(self.create_sompy_cmd(sample, pannumber)) + # Only add to depends_list if job ID from previous command + # is not empty + TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list("sompy", 'depends_list')) TSOcommands_list.append(self.create_upload_multiqc_command(TSO500)) TSOcommands_list.append(self.add_to_depends_list("UploadMultiQC", 'depends_list')) From 0db361266ce7557e8cabad5975b4e6f7f1fac9f6 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Thu, 19 Oct 2023 13:22:29 +0100 Subject: [PATCH 12/16] add multiqc command to TSOcommands_list and echo job IDs --- upload_and_setoff_workflows.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index d110dba1..e1f3a50e 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -1701,6 +1701,7 @@ def build_TSO500_post_run_commands(self): # Only add to depends_list if job ID from previous command # is not empty TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) TSOcommands_list.append(self.create_sambamba_cmd(sample, pannumber)) # Exclude negative controls from the depends list as the NTC @@ -1709,17 +1710,24 @@ def build_TSO500_post_run_commands(self): # Only add to depends_list if job ID from previous command # is not empty TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) if "HD200" in sample: TSOcommands_list.append(self.create_sompy_cmd(sample, pannumber)) # Only add to depends_list if job ID from previous command # is not empty TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list("sompy", 'depends_list')) - + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) + + TSOcommands_list.append(self.create_multiqc_command()) + TSOcommands_list.append(self.add_to_depends_list("MultiQC", 'depends_list')) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) TSOcommands_list.append(self.create_upload_multiqc_command(TSO500)) TSOcommands_list.append(self.add_to_depends_list("UploadMultiQC", 'depends_list')) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) TSOcommands_list.append(self.create_duty_csv_command()) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) with open( self.runfolder_obj.TSO500_post_run_command_script, "w" From 58e07050dbc0b05631b5239ebda0fc11ef0c1b08 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Fri, 20 Oct 2023 10:56:39 +0100 Subject: [PATCH 13/16] remove vcp3 exome depth and code review changes --- automate_demultiplex_config.py | 81 ++++------------------------------ upload_and_setoff_workflows.py | 10 ++--- 2 files changed, 14 insertions(+), 77 deletions(-) diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py index 4415b373..4d75c5dc 100644 --- a/automate_demultiplex_config.py +++ b/automate_demultiplex_config.py @@ -289,8 +289,8 @@ ED_readcount_normals_VCP2_file="project-ByfFPz00jy1fk6PjpZ95F27J:file-GZYbq400YG627Q12g1bbP440"#"Pan5188_normals_v1.0.0.RData" ED_VCP2_readcount_BEDfile_pannum = "Pan5188_exomedepth.bed" #VCP3 normals data file -ED_readcount_normals_VCP3_file=None#"Pan5149_normals_v1.0.0.RData" -ED_VCP3_readcount_BEDfile_pannum = "Pan5149_exomedepth.bed" +ED_readcount_normals_VCP3_file=None #"Pan5149_normals_v1.0.0.RData" +ED_VCP3_readcount_BEDfile_pannum = None #"Pan5149_exomedepth.bed" exomedepth_refgenome_file = "project-ByfFPz00jy1fk6PjpZ95F27J:file-B6ZY7VG2J35Vfvpkj8y0KZ01" #hs37d5.fa.gz from 001 ## readcount app inputs @@ -467,10 +467,6 @@ "Pan5085", # TSO500 High throughput Synnovis. no UTRS TERT promoter "Pan5112", # TSO500 High throughput BSPS. no UTRS TERT promoter "Pan5114", # TSO500 High throughput Control. no UTRS TERT promoter - #"Pan4042", # STG VCP2 BRCA - TO BE REMOVED IN FUTURE UPDATE - #"Pan4043", # STG VCP3 - TO BE REMOVED IN FUTURE UPDATE - #"Pan4044", # STG VCP1 - TO BE REMOVED IN FUTURE UPDATE - #"Pan4049", # STG VCP2 CrCa - TO BE REMOVED IN FUTURE UPDATE "Pan4119", # VCP1 Viapath R134 (FH) "Pan4121", # VCP1 Viapath R184 (CF) "Pan4122", # VCP1 Viapath R25 (FGFR) @@ -556,7 +552,6 @@ "Pan4122", "Pan4125", "Pan4126", - #"Pan4044", "Pan4821", "Pan4822", "Pan4823", @@ -579,8 +574,6 @@ "Pan4150", "Pan4129", "Pan4130", - #"Pan4042", - #"Pan4049", "Pan4816", "Pan4817", "Pan4819", @@ -606,7 +599,6 @@ "Pan4145", "Pan4146", "Pan4151", - #"Pan4043", "Pan4314", "Pan4351", "Pan4387", @@ -739,33 +731,6 @@ "hsmetrics_bedfile": "Pan4082.bed", "sambamba_bedfile": "Pan4082Sambamba.bed", }, - # "Pan4044": { # VCP1 STG - # "mokapipe": True, - # "multiqc_coverage_level": 30, - # "RPKM_bedfile_pan_number": "Pan4399", - # "RPKM_also_analyse": vcp1_panel_list, - # "congenica_credentials": "STG", - # "congenica_IR_template": "non-priority", - # "congenica_project": "4203", - # "hsmetrics_bedfile": "Pan4397data.bed", - # "variant_calling_bedfile": "Pan4398data.bed", - # "sambamba_bedfile": "Pan4397dataSambamba.bed", - # "STG": True, - # "exome_depth_cnvcalling_BED": "PanXXXX" - # }, - # "Pan4042": { # VCP2 STG BRCA - # "mokapipe": True, - # "multiqc_coverage_level": 30, - # "RPKM_bedfile_pan_number": "Pan5109", - # "RPKM_also_analyse": vcp2_panel_list, - # "congenica_credentials": "STG", - # "congenica_IR_template": "non-priority", - # "congenica_project": "1099", - # "hsmetrics_bedfile": "Pan5123data.bed", - # "variant_calling_bedfile": "Pan5119data.bed", - # "sambamba_bedfile": "Pan5123dataSambamba.bed", - # "exome_depth_cnvcalling_BED": "PanXXXX" - # }, "Pan5144": { # VCP2 R444.1 Breast cancer (PARP treatment- STG) "mokapipe": True, "multiqc_coverage_level": 30, @@ -797,32 +762,6 @@ "multiqc_coverage_level": 30, "variant_calling_bedfile": "Pan4009.bed", }, - # "Pan4049": { # VCP2 STG CrCa - # "mokapipe": True, - # "multiqc_coverage_level": 30, - # "RPKM_bedfile_pan_number": "Pan5109", - # "RPKM_also_analyse": vcp2_panel_list, - # "congenica_credentials": "STG", - # "congenica_IR_template": "non-priority", - # "congenica_project": "4202", - # "hsmetrics_bedfile": "Pan5123data.bed", - # "variant_calling_bedfile": "Pan5119data.bed", - # "sambamba_bedfile": "Pan5123dataSambamba.bed", - # "exome_depth_cnvcalling_BED": "PanXXXX" - # }, - # "Pan4043": { # VCP3 STG - # "mokapipe": True, - # "multiqc_coverage_level": 30, - # "RPKM_bedfile_pan_number": "Pan4362", - # "RPKM_also_analyse": vcp3_panel_list, - # "congenica_credentials": "STG", - # "congenica_IR_template": "non-priority", - # "congenica_project": "4201", - # "hsmetrics_bedfile": "Pan4995data.bed", - # "variant_calling_bedfile": "Pan4995data.bed", - # "sambamba_bedfile": "Pan4995dataSambamba.bed", - # "exome_depth_cnvcalling_BED": "PanXXXX" - # }, "Pan4119": { # VCP1 R134_Familial hypercholesterolaemia-Familial hypercholesterolaemia Small panel (Viapath) "mokapipe": True, "multiqc_coverage_level": 30, @@ -1181,7 +1120,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan5168" + #"exome_depth_cnvcalling_BED": "Pan5168" #Exome depth does not support VCP3 yet }, "Pan4146": { # VCP3 R81 CM (Viapath) "mokapipe": True, @@ -1192,7 +1131,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan5170" + #"exome_depth_cnvcalling_BED": "Pan5170" #Exome depth does not support VCP3 yet }, "Pan4151": { # VCP3 R82 limb girdle (Viapath) "mokapipe": True, @@ -1213,7 +1152,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan5177" + #"exome_depth_cnvcalling_BED": "Pan5177" #Exome depth does not support VCP3 yet }, "Pan4387": { # VCP3 R90 Bleeding and platelet disorders (Viapath) "mokapipe": True, @@ -1224,7 +1163,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan5171" + #"exome_depth_cnvcalling_BED": "Pan5171" #Exome depth does not support VCP3 yet }, "Pan4390": { # VCP3 R97 Thrombophilia with a likely monogenic cause (Viapath) "mokapipe": True, @@ -1235,7 +1174,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", - "exome_depth_cnvcalling_BED": "Pan5173", + #"exome_depth_cnvcalling_BED": "Pan5173", #Exome depth does not support VCP3 yet }, "Pan4314": { # VCP3 R229 (Viapath) "mokapipe": True, @@ -1462,7 +1401,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan5168", + # "exome_depth_cnvcalling_BED": "Pan5168", #Exome depth does not support VCP3 yet }, "Pan4834": { # VCP3 STG R81 "mokapipe": True, @@ -1475,7 +1414,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "exome_depth_cnvcalling_BED": "Pan5170", + #"exome_depth_cnvcalling_BED": "Pan5170", #Exome depth does not support VCP3 yet }, "Pan4835": { # VCP3 STG R82 "mokapipe": True, @@ -1594,7 +1533,6 @@ "variant_calling_bedfile": "Pan4767data.bed", "sambamba_bedfile": "Pan5018dataSambamba.bed", "masked_reference": "project-ByfFPz00jy1fk6PjpZ95F27J:file-GF84GF00QfBfzV35Gf8Qg53q", # hs37d5_Pan4967.bwa-index.tar.gz - }, "Pan5008": { # LRPCR STG R207 PMS2 "mokapipe": True, @@ -1607,7 +1545,6 @@ "variant_calling_bedfile": "Pan4767data.bed", "sambamba_bedfile": "Pan5018dataSambamba.bed", "masked_reference": "project-ByfFPz00jy1fk6PjpZ95F27J:file-GF84GF00QfBfzV35Gf8Qg53q", # hs37d5_Pan4967.bwa-index.tar.gz - }, "Pan5011": { # LRPCR Via R210 PMS2 "mokapipe": True, diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index e1f3a50e..e8ca36d5 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -119,7 +119,7 @@ def __init__(self, runfolder): + self.runfolder_name + "_TSO_post_run_commands.sh" ) - #TODO copy lines above to create separate dx run commands output script for TSO (to be run by duty binfx) + self.nexus_project_name = "" self.nexus_path = "" self.nexus_project_id = "" @@ -639,10 +639,9 @@ def check_for_TSO500(self): def split_TSO500_samplesheet(self): """ - take TSO500 samplesheet and split in to parts with <=16 samples/sheet + take TSO500 samplesheet and split in to parts with x samples per samplesheet (x defined in config.batch_size) write samplesheets to runfolder - return list of samplesheet paths? or just names (if they're saved in the runfolder, - they'll be uploaded to DNAnexus, can access from there for dx run cmds) + returns: list of samplesheet names """ # samplesheet in the runfolder samplesheet_file = os.path.join(self.runfolder_obj.runfolderpath, self.runfolder_obj.runfolder_samplesheet_name) @@ -1793,6 +1792,7 @@ def determine_exome_depth_requirements(self,pannnumber_list): command_list.append(self.build_ED_cnv_calling_cmd(panel)) return command_list + def build_ED_readcount_cmd(self,pannumber_list, normals_file,readcount_bedfile_pannum): """ This function builds the dx run command for the exome depth readcount app @@ -2360,7 +2360,7 @@ def prepare_rpkm_list(self, rpkm_list): # return list to be used to build rpkm command(s). return cleaned_list - # TODO set this up so it only runs the RPKM app if there are enough samples files (minimum 3 required by the app) + def create_rpkm_command(self, pannumber): """ Input = Pannumber for a single RPKM analysis From 49686483bcb4dc1b8b693fb3bd6e8a545fac2725 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Fri, 20 Oct 2023 13:03:30 +0100 Subject: [PATCH 14/16] corrected samplesheet paths for TSO samplesheet splitting function --- upload_and_setoff_workflows.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index e8ca36d5..e1b7a320 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -644,7 +644,7 @@ def split_TSO500_samplesheet(self): returns: list of samplesheet names """ # samplesheet in the runfolder - samplesheet_file = os.path.join(self.runfolder_obj.runfolderpath, self.runfolder_obj.runfolder_samplesheet_name) + samplesheet_file = self.runfolder_obj.runfolder_samplesheet_path samplesheet_header = [] samples = [] @@ -685,7 +685,9 @@ def split_TSO500_samplesheet(self): # Write batches to separate files named "PartXofY", and add samplesheet to list samplesheet_list = [] number_of_batches = len(batches) - samplesheet_base_name = samplesheet_file.split(".csv")[0] + #capture path for samplesheet in runfolder + runfolder_samplesheet_file = samplesheet_file = os.path.join(self.runfolder_obj.runfolderpath, self.runfolder_obj.runfolder_samplesheet_name) + samplesheet_base_name = runfolder_samplesheet_file.split(".csv")[0] for samplesheet_count, batch in enumerate(batches, start=1): #capture samplesheet file path to write samplesheet paths to the runfolder samplesheet_filepath = "%sPart%sof%s.csv" % (samplesheet_base_name,samplesheet_count,number_of_batches) From 6f726bd1d7c22fa5a321556990df1878922199f0 Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Fri, 20 Oct 2023 16:28:03 +0100 Subject: [PATCH 15/16] corrected error in TSO samplesheet splitting function and removed comment lines for dx run commands file --- upload_and_setoff_workflows.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index e1b7a320..acc47376 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -686,7 +686,7 @@ def split_TSO500_samplesheet(self): samplesheet_list = [] number_of_batches = len(batches) #capture path for samplesheet in runfolder - runfolder_samplesheet_file = samplesheet_file = os.path.join(self.runfolder_obj.runfolderpath, self.runfolder_obj.runfolder_samplesheet_name) + runfolder_samplesheet_file = os.path.join(self.runfolder_obj.runfolderpath, self.runfolder_obj.runfolder_samplesheet_name) samplesheet_base_name = runfolder_samplesheet_file.split(".csv")[0] for samplesheet_count, batch in enumerate(batches, start=1): #capture samplesheet file path to write samplesheet paths to the runfolder @@ -1568,11 +1568,6 @@ def start_building_dx_run_cmds(self): # If panel is to be processed using mokapipe if self.panel_dictionary[panel]["mokapipe"]: - commands_list.append("#For each sample there are 5 lines of commands. The dx run command for the workflow. and the jobid is then added to two depends_on lists.") - commands_list.append("#The gatk depends on list is used for apps that only need to wait for the individual sample processes to finish (eg cnv calling") - commands_list.append("#The depends_on list is used for jobs that also require run wide jobs to finish (eg peddy)") - commands_list.append("#The 4th line passes the jobid to decision_support_tool_inputs.py which returns some inputs for the congenica upload command") - commands_list.append("#The 5th line uses this output and echos the dx run command to a bash script to be run after QC is checked") # call function to build the Mokapipe command and add to command list and depends list commands_list.append( self.create_mokapipe_command(fastq, panel) From fb36a6e363e77eceb5b135a0336e807c13a8fb8c Mon Sep 17 00:00:00 2001 From: MokaGuys Date: Mon, 23 Oct 2023 10:23:33 +0100 Subject: [PATCH 16/16] change testing to False --- automate_demultiplex_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py index 4d75c5dc..bf796d3b 100644 --- a/automate_demultiplex_config.py +++ b/automate_demultiplex_config.py @@ -8,7 +8,7 @@ import os # Set debug mode -testing = True +testing = False # =====location of input/output files===== # root of folder that contains the apps, automate_demultiplexing_logfiles and