diff --git a/automate_demultiplex_config.py b/automate_demultiplex_config.py index 8a213bf0..bf796d3b 100644 --- a/automate_demultiplex_config.py +++ b/automate_demultiplex_config.py @@ -56,6 +56,12 @@ "999999_A01229_0182_AHM2TSO500", ] +# TSO500 batch size (for splitting samplesheet) +if testing: + batch_size = 2 +else: + batch_size = 16 + # path to log file which records the output of the upload agent upload_and_setoff_workflow_logfile = ( "{document_root}/automate_demultiplexing_logfiles/upload_agent_script_logfiles/" @@ -134,7 +140,7 @@ # MokaSNP ID mokasnp_pipeline_ID = "5091" # TSO500 pipeline ID -TSO_pipeline_ID = "5237" +TSO_pipeline_ID = "5288" #TSO v1.6 # -- Moka WES test status-- # Test Status = NextSEQ sequencing @@ -170,9 +176,9 @@ congenica_app_path = "Apps/congenica_upload_v1.3.2" congenica_SFTP_upload_app = "applet-GFfJpj80jy1x1Bz1P1Bk3vQf" -# TSO500 app -tso500_app = "applet-GPgkz0j0jy1Yf4XxkXjVgKfv" # Apps/TSO500_v1.5.1 -tso500_app_name = "TSO500_v1.5.1" +# TSO500 app +tso500_app = "applet-GZgv0Jj0jy1Yfbx3QvqyKjzp" # Apps/TSO500_v1.6.0 +tso500_app_name = "TSO500_v1.6.0" tso500_docker_image = ( "project-ByfFPz00jy1fk6PjpZ95F27J:file-Fz9Zyx00b5j8xKVkKv4fZ6JB" ) @@ -189,6 +195,7 @@ upload_multiqc_path = "Apps/upload_multiqc_v1.4.0" # RPKM path RPKM_path = "Apps/RPKM_using_conifer_v1.6" + # FastQC app fastqc_app = "Apps/fastqc_v1.4.0" # bedfile folder @@ -268,6 +275,41 @@ # Mokapipe FH_PRS BED file FH_PRS_bedfile_name = "Pan4909.bed" +### exome depth +# exome depth readcount app +ED_readcount_path = "Apps/ED_readcount_analysis_v1.2.0" +ED_readcount_path_instance_type = "mem1_ssd1_v2_x8" +#exome depth variant calling app +ED_cnvcalling_path = "Apps/ED_cnv_calling_v1.2.0" +ED_cnvcalling_instance_type = "mem1_ssd1_v2_x4" +#VCP1 exome depth +ED_readcount_normals_VCP1_file= "project-ByfFPz00jy1fk6PjpZ95F27J:file-GZYK6380f66PPy4kjzVQ7xj8"#"Pan5191_normals_v1.0.0.RData" +ED_VCP1_readcount_BEDfile_pannum = "Pan5191_exomedepth.bed" +#VCP2 normals data file 
+ED_readcount_normals_VCP2_file="project-ByfFPz00jy1fk6PjpZ95F27J:file-GZYbq400YG627Q12g1bbP440"#"Pan5188_normals_v1.0.0.RData" +ED_VCP2_readcount_BEDfile_pannum = "Pan5188_exomedepth.bed" +#VCP3 normals data file +ED_readcount_normals_VCP3_file=None #"Pan5149_normals_v1.0.0.RData" +ED_VCP3_readcount_BEDfile_pannum = None #"Pan5149_exomedepth.bed" + +exomedepth_refgenome_file = "project-ByfFPz00jy1fk6PjpZ95F27J:file-B6ZY7VG2J35Vfvpkj8y0KZ01" #hs37d5.fa.gz from 001 +## readcount app inputs +exomedepth_readcount_reference_genome_input=" -ireference_genome=%s" % (exomedepth_refgenome_file) +exomedepth_readcount_bedfile_input=" -ibedfile=" +exomedepth_readcount_normalsRdata_input=" -inormals_RData=" +exomedepth_readcount_projectname_input=" -iproject_name=" +exomedepth_readcount_pannumbers_input=" -ibamfile_pannumbers=" +exomedepth_readcount_rdata_output="RData" + + +## ED CNV calling inputs +exomedepth_cnvcalling_reference_genome_input=" -ireference_genome=%s" % (exomedepth_refgenome_file) +exomedepth_cnvcalling_readcount_file_input=" -ireadcount_file=" +exomedepth_cnvcalling_subpanel_bed_input=" -isubpanel_bed=" +exomedepth_cnvcalling_projectname_input=" -iproject_name=" +exomedepth_cnvcalling_pannumbers_input=" -ibamfile_pannumbers=" + + # MokaWES workflow_inputs wes_fastqc1 = " -istage-Ff0P5Jj0GYKY717pKX3vX8Z3.reads=" # FastQC Read 1 wes_fastqc2 = " -istage-Ff0P5V00GYKyJfpX5bqX69Yg.reads=" # FastQC Read 2 @@ -347,6 +389,7 @@ TSO500_samplesheet_stage = " -isamplesheet=" TSO500_analysis_options_stage = " -ianalysis_options=" TSO500_project_name_stage = " -iproject_name=" +TSO500_runfolder_name_stage = " -irunfolder_name=" # app instance types TSO500_analysis_instance_high_throughput = "mem1_ssd1_v2_x72" @@ -424,10 +467,6 @@ "Pan5085", # TSO500 High throughput Synnovis. no UTRS TERT promoter "Pan5112", # TSO500 High throughput BSPS. no UTRS TERT promoter "Pan5114", # TSO500 High throughput Control. 
no UTRS TERT promoter - "Pan4042", # STG VCP2 BRCA - TO BE REMOVED IN FUTURE UPDATE - "Pan4043", # STG VCP3 - TO BE REMOVED IN FUTURE UPDATE - "Pan4044", # STG VCP1 - TO BE REMOVED IN FUTURE UPDATE - "Pan4049", # STG VCP2 CrCa - TO BE REMOVED IN FUTURE UPDATE "Pan4119", # VCP1 Viapath R134 (FH) "Pan4121", # VCP1 Viapath R184 (CF) "Pan4122", # VCP1 Viapath R25 (FGFR) @@ -448,10 +487,10 @@ "Pan4146", # VCP3 Viapath R81 (CM) "Pan4149", # VCP2 Viapath R208 (BRCA) "Pan4150", # VCP2 Viapath R207 (ovarian) - "Pan4127", # VCP2 Viapath R209 (colorectal) "Pan4129", # VCP2 Viapath R210 (lynch) "Pan4964", # VCP2 Viapath R259 (nijmegen) "Pan4130", # VCP2 Viapath R211 (polyposis) + "Pan5186", # VCP2 Viapath R414 APC "Pan5121", # VCP2 Viapath R430 (prostate) "Pan5143", # VCP2 Viapath R444.1 Breast cancer (PARP treatment) "Pan5147", # VCP2 Viapath R444.2 Prostate cancer (PARP treatment) @@ -474,9 +513,9 @@ "Pan4825", # VCP1 STG R337 CADASIL "Pan4816", # VCP2 STG R208 BRCA "Pan4817", # VCP2 STG R207 ovarian - "Pan4818", # VCP2 STG R209 colorectal "Pan4819", # VCP2 STG R210 lynch "Pan4820", # VCP2 STG R211 polyposis + "Pan5185", # VCP2 STG R414 APC "Pan5122", # VCP2 STG R430 prostate "Pan5144", # VCP2 STG R444.1 Breast cancer (PARP treatment) "Pan5148", # VCP2 STG R444.2 Prostate cancer (PARP treatment) @@ -501,6 +540,7 @@ "Pan5014", # LRPCR STG R211 PMS2 "Pan5015", # LRPCR Via R71 SMN1 "Pan5016", # LRPCR Via R239 IKBKG + "Pan5180", # development run - stops warning messages ] @@ -512,7 +552,6 @@ "Pan4122", "Pan4125", "Pan4126", - "Pan4044", "Pan4821", "Pan4822", "Pan4823", @@ -533,14 +572,10 @@ vcp2_panel_list = [ "Pan4149", "Pan4150", - "Pan4127", "Pan4129", "Pan4130", - "Pan4042", - "Pan4049", "Pan4816", "Pan4817", - "Pan4818", "Pan4819", "Pan4820", "Pan4964", @@ -549,7 +584,9 @@ "Pan5143", "Pan5144", "Pan5147", - "Pan5148" + "Pan5148", + "Pan5185", + "Pan5186", ] vcp3_panel_list = [ "Pan4132", @@ -562,7 +599,6 @@ "Pan4145", "Pan4146", "Pan4151", - "Pan4043", "Pan4314", 
"Pan4351", "Pan4387", @@ -596,12 +632,13 @@ "Pan5015", "Pan5016", ] +development_pannumber_list=["Pan5180"] tso500_panel_list = [ "Pan4969", "Pan5085", "Pan5112", "Pan5114", -] # note the settings from the first item in this list are used when setting off the TSO500_output_parser commands. +] default_panel_properties = { @@ -646,6 +683,9 @@ "TSO500_high_throughput": False, "drylab_dnanexus_id": None, "masked_reference": False, + "exome_depth_cnvcalling_BED": False, + "development_run":False, # used to stopunknown pan number errors but will only demultiplex + } # override default panel settings @@ -691,31 +731,6 @@ "hsmetrics_bedfile": "Pan4082.bed", "sambamba_bedfile": "Pan4082Sambamba.bed", }, - "Pan4044": { # VCP1 STG - "mokapipe": True, - "multiqc_coverage_level": 30, - "RPKM_bedfile_pan_number": "Pan4399", - "RPKM_also_analyse": vcp1_panel_list, - "congenica_credentials": "STG", - "congenica_IR_template": "non-priority", - "congenica_project": "4203", - "hsmetrics_bedfile": "Pan4397data.bed", - "variant_calling_bedfile": "Pan4398data.bed", - "sambamba_bedfile": "Pan4397dataSambamba.bed", - "STG": True, - }, - "Pan4042": { # VCP2 STG BRCA - "mokapipe": True, - "multiqc_coverage_level": 30, - "RPKM_bedfile_pan_number": "Pan5109", - "RPKM_also_analyse": vcp2_panel_list, - "congenica_credentials": "STG", - "congenica_IR_template": "non-priority", - "congenica_project": "1099", - "hsmetrics_bedfile": "Pan5123data.bed", - "variant_calling_bedfile": "Pan5119data.bed", - "sambamba_bedfile": "Pan5123dataSambamba.bed", - }, "Pan5144": { # VCP2 R444.1 Breast cancer (PARP treatment- STG) "mokapipe": True, "multiqc_coverage_level": 30, @@ -727,6 +742,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan5183" }, "Pan5148": { # VCP2 R444.2 Prostate cancer (PARP treatment- STG) "mokapipe": True, @@ -739,36 +755,13 @@ "hsmetrics_bedfile": "Pan5123data.bed", 
"variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan5184" }, "Pan4009": { # MokaSNP "mokasnp": True, "multiqc_coverage_level": 30, "variant_calling_bedfile": "Pan4009.bed", }, - "Pan4049": { # VCP2 STG CrCa - "mokapipe": True, - "multiqc_coverage_level": 30, - "RPKM_bedfile_pan_number": "Pan5109", - "RPKM_also_analyse": vcp2_panel_list, - "congenica_credentials": "STG", - "congenica_IR_template": "non-priority", - "congenica_project": "4202", - "hsmetrics_bedfile": "Pan5123data.bed", - "variant_calling_bedfile": "Pan5119data.bed", - "sambamba_bedfile": "Pan5123dataSambamba.bed", - }, - "Pan4043": { # VCP3 STG - "mokapipe": True, - "multiqc_coverage_level": 30, - "RPKM_bedfile_pan_number": "Pan4362", - "RPKM_also_analyse": vcp3_panel_list, - "congenica_credentials": "STG", - "congenica_IR_template": "non-priority", - "congenica_project": "4201", - "hsmetrics_bedfile": "Pan4995data.bed", - "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", - }, "Pan4119": { # VCP1 R134_Familial hypercholesterolaemia-Familial hypercholesterolaemia Small panel (Viapath) "mokapipe": True, "multiqc_coverage_level": 30, @@ -779,6 +772,7 @@ "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", "FH": True, + "exome_depth_cnvcalling_BED": "Pan4702" }, "Pan4121": { # VCP1 R184 CF (Viapath) "mokapipe": True, @@ -789,6 +783,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4703" }, "Pan4122": { # VCP1 R25 FGFR Viapath "mokapipe": True, @@ -798,7 +793,7 @@ "RPKM_also_analyse": vcp1_panel_list, "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", - "variant_calling_bedfile": "Pan4398data.bed", + "variant_calling_bedfile": "Pan4398data.bed", # CNV not required }, "Pan4125": { # VCP1 
R73 DMD (Viapath) "mokapipe": True, @@ -809,6 +804,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4622" }, "Pan4126": { # VCP1 R337_CADASIL Viapath "mokapipe": True, @@ -818,7 +814,7 @@ "RPKM_also_analyse": vcp1_panel_list, "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", - "variant_calling_bedfile": "Pan4398data.bed", + "variant_calling_bedfile": "Pan4398data.bed",# cnv not required }, "Pan4974": { # VCP1 Viapath (Molecular Haemostasis) R112 "mokapipe": True, @@ -829,6 +825,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4985" }, "Pan4975": { # VCP1 Viapath (Molecular Haemostasis) R115 "mokapipe": True, @@ -839,6 +836,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4986" }, "Pan4976": { # VCP1 Viapath (Molecular Haemostasis) R116 "mokapipe": True, @@ -849,6 +847,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4987" }, "Pan4977": { # VCP1 Viapath (Molecular Haemostasis) R117 "mokapipe": True, @@ -859,6 +858,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4988" }, "Pan4978": { # VCP1 Viapath (Molecular Haemostasis) R118 "mokapipe": True, @@ -869,6 +869,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4989" }, "Pan4979": { # VCP1 Viapath (Molecular Haemostasis) R119 "mokapipe": True, @@ 
-879,6 +880,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4990" }, "Pan4980": { # VCP1 Viapath (Molecular Haemostasis) R120 "mokapipe": True, @@ -889,6 +891,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4991" }, "Pan4981": { # VCP1 Viapath (Molecular Haemostasis) R121 "mokapipe": True, @@ -899,6 +902,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4708" }, "Pan4982": { # VCP1 Viapath (Molecular Haemostasis) R122 "mokapipe": True, @@ -909,6 +913,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4992" }, "Pan4983": { # VCP1 Viapath (Molecular Haemostasis) R123 "mokapipe": True, @@ -919,6 +924,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4993" }, "Pan4984": { # VCP1 Viapath (Molecular Haemostasis) R124 "mokapipe": True, @@ -929,8 +935,9 @@ "hsmetrics_bedfile": "Pan4397data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "variant_calling_bedfile": "Pan4398data.bed", + "exome_depth_cnvcalling_BED": "Pan4994" }, - "Pan4149": { # VCP2 BRCA (Viapath) + "Pan4149": { # VCP2 BRCA (Viapath) R208 "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", @@ -939,6 +946,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", + "exome_depth_cnvcalling_BED": "Pan5158" }, "Pan4964": { # VCP2 R259 nijmegen breakage (Viapath) "mokapipe": True, @@ 
-949,6 +957,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", + "exome_depth_cnvcalling_BED": "Pan5161" }, "Pan4150": { # VCP2 R207 ovarian cancer (Viapath) "mokapipe": True, @@ -960,29 +969,33 @@ "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5152" }, - "Pan4127": { # VCP2 R209 colorectal cancer (Viapath) + "Pan4129": { # VCP2 R210 Lynch syndrome (Viapath) "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", - "congenica_project": "5093", + "congenica_project": "5094", "RPKM_also_analyse": vcp2_panel_list, "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", + "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5193" # use R211 CNV bedfile }, - "Pan4129": { # VCP2 R210 Lynch syndrome (Viapath) + "Pan4130": { # VCP2 R211 polyposis (Viapath) "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", - "congenica_project": "5094", + "congenica_project": "5095", "RPKM_also_analyse": vcp2_panel_list, "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5193" }, - "Pan4130": { # VCP2 R211 polyposis (Viapath) + "Pan5186": { # VCP2 R414 APC (Viapath) "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", @@ -991,7 +1004,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", - "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5162" }, "Pan5121": { # VCP2 R430 prostate (Viapath) "mokapipe": True, @@ -1003,6 +1016,7 @@ "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": 
"Pan5119data.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5165", }, "Pan5143": { # VCP2 R444.1 Breast cancer (PARP treatment- Viapath) "mokapipe": True, @@ -1013,6 +1027,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", + "exome_depth_cnvcalling_BED": "Pan5183" }, "Pan5147": { # VCP2 R444.2 Prostate cancer (PARP treatment- Viapath) "mokapipe": True, @@ -1023,6 +1038,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "variant_calling_bedfile": "Pan5119data.bed", + "exome_depth_cnvcalling_BED": "Pan5184", }, "Pan4132": { # VCP3 R56 (Viapath) "mokapipe": True, @@ -1032,7 +1048,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed", # CNV not required }, "Pan4134": { # VCP3 R57 (Viapath) "mokapipe": True, @@ -1042,7 +1058,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed", # CNV not required }, "Pan4136": { # VCP3 R58 (Viapath) "mokapipe": True, @@ -1052,7 +1068,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed",# CNV not required }, "Pan4137": { # VCP3 R60 (Viapath) "mokapipe": True, @@ -1062,7 +1078,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed",# CNV not required }, "Pan4138": { # VCP3 R62 (Viapath) "mokapipe": True, @@ -1072,7 +1088,7 @@ 
"RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed",# CNV not required }, "Pan4143": { # VCP3 R66 (Viapath) "mokapipe": True, @@ -1083,6 +1099,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + # "exome_depth_cnvcalling_BED": "Pan5174" # CNV BED not yet available }, "Pan4144": { # VCP3 R78 (Viapath) "mokapipe": True, @@ -1092,7 +1109,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed",# CNV not required }, "Pan4145": { # VCP3 R79 - CMD (Viapath) "mokapipe": True, @@ -1103,6 +1120,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + #"exome_depth_cnvcalling_BED": "Pan5168" #Exome depth does not support VCP3 yet }, "Pan4146": { # VCP3 R81 CM (Viapath) "mokapipe": True, @@ -1113,6 +1131,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + #"exome_depth_cnvcalling_BED": "Pan5170" #Exome depth does not support VCP3 yet }, "Pan4151": { # VCP3 R82 limb girdle (Viapath) "mokapipe": True, @@ -1122,7 +1141,7 @@ "RPKM_also_analyse": vcp3_panel_list, "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", - "variant_calling_bedfile": "Pan4995data.bed", + "variant_calling_bedfile": "Pan4995data.bed",# CNV not required }, "Pan4351": { # VCP3 R227 (Viapath) "mokapipe": True, @@ -1133,6 +1152,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + 
#"exome_depth_cnvcalling_BED": "Pan5177" #Exome depth does not support VCP3 yet }, "Pan4387": { # VCP3 R90 Bleeding and platelet disorders (Viapath) "mokapipe": True, @@ -1143,6 +1163,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + #"exome_depth_cnvcalling_BED": "Pan5171" #Exome depth does not support VCP3 yet }, "Pan4390": { # VCP3 R97 Thrombophilia with a likely monogenic cause (Viapath) "mokapipe": True, @@ -1153,6 +1174,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + #"exome_depth_cnvcalling_BED": "Pan5173", #Exome depth does not support VCP3 yet }, "Pan4314": { # VCP3 R229 (Viapath) "mokapipe": True, @@ -1163,6 +1185,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", "variant_calling_bedfile": "Pan4995data.bed", + # "exome_depth_cnvcalling_BED": "Pan5179", bedfile not yet made }, "Pan4396": { # ArcherDx (Synnovis) "archerdx": True, @@ -1178,7 +1201,7 @@ }, "Pan4969": { # TSO500 no UTRs. TERT promoter "TSO500": True, - "sambamba_bedfile": "Pan5130dataSambamba.bed", + "sambamba_bedfile": "Pan5205dataSambamba.bed", "clinical_coverage_depth": 100, "multiqc_coverage_level": 100, "coverage_min_basecall_qual": 25, @@ -1187,7 +1210,7 @@ "Pan5085": { # TSO500 High throughput Synnovis. no UTRs. TERT promoter "TSO500": True, "TSO500_high_throughput": True, - "sambamba_bedfile": "Pan5130dataSambamba.bed", + "sambamba_bedfile": "Pan5205dataSambamba.bed", "clinical_coverage_depth": 100, "multiqc_coverage_level": 100, "coverage_min_basecall_qual": 25, @@ -1196,7 +1219,7 @@ "Pan5112": { # TSO500 High throughput BSPS. no UTRs. 
TERT promoter "TSO500": True, "TSO500_high_throughput": True, - "sambamba_bedfile": "Pan5130dataSambamba.bed", + "sambamba_bedfile": "Pan5205dataSambamba.bed", "clinical_coverage_depth": 100, "multiqc_coverage_level": 100, "coverage_min_basecall_qual": 25, @@ -1206,7 +1229,7 @@ "Pan5114": { # TSO500 High throughput Control. no UTRs. TERT promoter "TSO500": True, "TSO500_high_throughput": True, - "sambamba_bedfile": "Pan5130dataSambamba.bed", + "sambamba_bedfile": "Pan5205dataSambamba.bed", "clinical_coverage_depth": 100, "multiqc_coverage_level": 100, "coverage_min_basecall_qual": 25, @@ -1226,6 +1249,7 @@ "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, "FH": True, + "exome_depth_cnvcalling_BED": "Pan4702" }, "Pan4822": { # VCP1 STG R184_CF "mokapipe": True, @@ -1239,6 +1263,7 @@ "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, + "exome_depth_cnvcalling_BED": "Pan4703", }, "Pan4823": { # VCP1 STG R25_FGFR "mokapipe": True, @@ -1251,7 +1276,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", - "STG": True, + "STG": True, # CNV not required }, "Pan4824": { # VCP1 STG R73_DMD "mokapipe": True, @@ -1265,6 +1290,7 @@ "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", "STG": True, + "exome_depth_cnvcalling_BED": "Pan4622" }, "Pan4825": { # VCP1 STG R337_cadasil "mokapipe": True, @@ -1277,7 +1303,7 @@ "hsmetrics_bedfile": "Pan4397data.bed", "variant_calling_bedfile": "Pan4398data.bed", "sambamba_bedfile": "Pan4397dataSambamba.bed", - "STG": True, + "STG": True,# CNV not required }, "Pan4826": { # VCP3 STG R56 "mokapipe": True, @@ -1289,7 +1315,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, 
"Pan4827": { # VCP3 STG R57 "mokapipe": True, @@ -1301,7 +1327,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4828": { # VCP3 STG R58 "mokapipe": True, @@ -1313,7 +1339,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4829": { # VCP3 STG R60 "mokapipe": True, @@ -1325,7 +1351,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4830": { # VCP3 STG R62 "mokapipe": True, @@ -1337,7 +1363,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4831": { # VCP3 STG R66 "mokapipe": True, @@ -1350,6 +1376,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + # "exome_depth_cnvcalling_BED": "Pan5174" BEDfile not yet available }, "Pan4832": { # VCP3 STG R78 "mokapipe": True, @@ -1361,7 +1388,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4833": { # VCP3 STG R79 "mokapipe": True, @@ -1374,6 +1401,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + # 
"exome_depth_cnvcalling_BED": "Pan5168", #Exome depth does not support VCP3 yet }, "Pan4834": { # VCP3 STG R81 "mokapipe": True, @@ -1386,6 +1414,7 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + #"exome_depth_cnvcalling_BED": "Pan5170", #Exome depth does not support VCP3 yet }, "Pan4835": { # VCP3 STG R82 "mokapipe": True, @@ -1397,7 +1426,7 @@ "congenica_project": "4201", "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", - "sambamba_bedfile": "Pan4995dataSambamba.bed", + "sambamba_bedfile": "Pan4995dataSambamba.bed",# CNV not required }, "Pan4836": { # VCP3 STG R229 "mokapipe": True, @@ -1410,8 +1439,9 @@ "hsmetrics_bedfile": "Pan4995data.bed", "variant_calling_bedfile": "Pan4995data.bed", "sambamba_bedfile": "Pan4995dataSambamba.bed", + #"exome_depth_cnvcalling_BED": "Pan5179" BEDfile not yet available }, - "Pan4818": { # VCP2 STG R209 + "Pan4819": { # VCP2 STG R210 "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", @@ -1422,8 +1452,10 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", + "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5193" # useR211 bedfile }, - "Pan4819": { # VCP2 STG R210 + "Pan4820": { # VCP2 STG R211 "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", @@ -1435,8 +1467,9 @@ "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5193" }, - "Pan4820": { # VCP2 STG R211 + "Pan5185": { # VCP2 STG R414 "mokapipe": True, "multiqc_coverage_level": 30, "RPKM_bedfile_pan_number": "Pan5109", @@ -1447,7 +1480,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", - "polyedge": "MSH2", + 
"exome_depth_cnvcalling_BED": "Pan5162" }, "Pan4816": { # VCP2 STG R208 "mokapipe": True, @@ -1460,6 +1493,7 @@ "hsmetrics_bedfile": "Pan5123data.bed", "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", + "exome_depth_cnvcalling_BED": "Pan5158" }, "Pan4817": { # VCP2 STG R207 "mokapipe": True, @@ -1473,6 +1507,7 @@ "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5152" }, "Pan5122": { # VCP2 STG R430 prostate "mokapipe": True, @@ -1486,6 +1521,7 @@ "variant_calling_bedfile": "Pan5119data.bed", "sambamba_bedfile": "Pan5123dataSambamba.bed", "polyedge": "MSH2", + "exome_depth_cnvcalling_BED": "Pan5165" }, "Pan5007": { # LRPCR Via R207 PMS2 "mokapipe": True, @@ -1601,6 +1637,9 @@ "sambamba_bedfile": "Pan5018dataSambamba.bed", "masked_reference": "project-ByfFPz00jy1fk6PjpZ95F27J:file-GF84GF00QfBfzV35Gf8Qg53q", # hs37d5_Pan4967.bwa-index.tar.gz }, + "Pan5180": { # DEVELOPMENT run - used to allow demultiplexing, but stop samplesheet checks/incorrect pan number alerts + "development_run": True, + }, } @@ -1669,17 +1708,15 @@ } duty_csv_id = ( - "project-ByfFPz00jy1fk6PjpZ95F27J:applet-GQg9J280jy1Zf79KGx9gk5K3" + "project-ByfFPz00jy1fk6PjpZ95F27J:applet-GZYx3Kj0kKj3YBV7qgK6VjXQ" ) duty_csv_inputs = { # tso_pannumbers should not include the dry lab pan number "tso_pannumbers": "-itso_pannumbers=Pan4969,Pan5085,Pan5114", "stg_pannumbers": ( - "-istg_pannumbers=Pan4042,Pan4043,Pan4044,Pan4049,Pan4821,Pan4822," - "Pan4823,Pan4824,Pan4825,Pan4816,Pan4817,Pan4818,Pan4819,Pan4820," - "Pan4826,Pan4827,Pan4828,Pan4829,Pan4830,Pan4831,Pan4832,Pan4833," - "Pan4834,Pan4835,Pan4836,Pan5008,Pan5010,Pan5012,Pan5014,Pan5122," - "Pan5144,Pan5148" + "-istg_pannumbers=Pan4821,Pan4822,Pan4823,Pan4824,Pan4825,Pan4816,Pan4817,Pan4819,Pan4820," + "Pan4826,Pan4827,Pan4828,Pan4829,Pan4830,Pan4831,Pan4832,Pan4833,Pan4834,Pan4835,Pan4836," + 
"Pan5008,Pan5010,Pan5012,Pan5014,Pan5122,Pan5144,Pan5148" ), "cp_capture_pannos": "-icp_capture_pannos=Pan5109,Pan4399,Pan4362", } diff --git a/demultiplex.py b/demultiplex.py index 67b4d001..35d7e160 100644 --- a/demultiplex.py +++ b/demultiplex.py @@ -212,23 +212,52 @@ def already_demultiplexed(self, runfolder): self.script_logfile.write("Checking if already demultiplexed .........Run has not yet been demultiplexed\n") self.samplesheet = self.runfolder + "_SampleSheet.csv" self.samplesheet_path = os.path.join(config.samplesheets_dir, self.samplesheet) - # run samplesheet checks (uses try to ensure that should an error occur this doesn't affect the other - # script functionality - ss_verification_results = samplesheet_verifier.run_ss_checks(self.samplesheet_path) - ss_fail = "" - ss_pass = "" - # If the value is True (i.e. check has passed), append to pass list, else append to fail list - for key in ss_verification_results: - if ss_verification_results[key][0]: - ss_pass += ss_verification_results[key][1] - else: - ss_fail += ss_verification_results[key][1] - if ss_pass: - self.logger("Following samplesheet checked were passed by {}: {}".format(self.samplesheet, ss_pass), - "demultiplex_success") - if ss_fail: - self.logger("SAMPLESHEET CHECKS FAILED {}: {}".format(self.samplesheet, ss_fail), "samplesheet_warning") + # if development run skip the samplesheet check to avoid endless alerts + if not self.check_for_development_run(self.samplesheet_path): + # run samplesheet checks (uses try to ensure that should an error occur this doesn't affect the other + # script functionality + ss_verification_results = samplesheet_verifier.run_ss_checks(self.samplesheet_path) + ss_fail = "" + ss_pass = "" + # If the value is True (i.e. 
check has passed), append to pass list, else append to fail list + for key in ss_verification_results: + if ss_verification_results[key][0]: + ss_pass += ss_verification_results[key][1] + else: + ss_fail += ss_verification_results[key][1] + if ss_pass: + self.logger("Following samplesheet checked were passed by {}: {}".format(self.samplesheet, ss_pass), + "demultiplex_success") + if ss_fail: + self.logger("SAMPLESHEET CHECKS FAILED {}: {}".format(self.samplesheet, ss_fail), "samplesheet_warning") + else: + self.script_logfile.write("development pan number identified.skipping samplesheet checks\n") self.has_run_finished() + + def check_for_development_run(self,samplesheet_path): + """ + Read samplesheet looking for development pan number. + If pannumber where development_run is True is present add samplename to list + return sample_list (will return False if empty) + """ + sample_list = [] + + with open(samplesheet_path, "r") as samplesheet_stream: + # read the file into a list and loop through the list in reverse (bottom to top). + # this allows us to access the sample names, and stop when reach the column headers, skipping the header of the file. + for line in reversed(samplesheet_stream.readlines()): + if line.startswith("Sample_ID") or "[Data]" in line: + break + # skip empty lines (check first element of the line, after splitting on comma) + elif len(line.split(",")[0]) < 2: + pass + # if it's a line detailing a sample + else: + for pannum in config.development_pannumber_list: + if pannum in line: + sample_list.append(line.split(",")[0]) + + return sample_list def has_run_finished(self): """Check if sequencing has completed for the current runfolder. 
This is denoted by the diff --git a/upload_and_setoff_workflows.py b/upload_and_setoff_workflows.py index d171b676..acc47376 100644 --- a/upload_and_setoff_workflows.py +++ b/upload_and_setoff_workflows.py @@ -114,6 +114,12 @@ def __init__(self, runfolder): + self.runfolder_name + "_congenica_upload_commands.sh" ) + self.TSO500_post_run_command_script = ( + config.DNA_Nexus_workflow_logfolder + + self.runfolder_name + + "_TSO_post_run_commands.sh" + ) + self.nexus_project_name = "" self.nexus_path = "" self.nexus_project_id = "" @@ -148,12 +154,15 @@ def __init__(self, runfolder, now, debug_mode=False): # list of fastqs to get ngs run number and WES batch self.list_of_processed_samples = [] + #list of TSO samplesheets + self.TSO500_samplesheets_list = [] + # DNA Nexus commands to be built on later - self.source_command = "#!/bin/bash\n. %s\n" % ( + self.source_command = "#!/bin/bash\n. %s" % ( config.sdk_source_cmd ) - self.empty_depends = "depends_list=''\n" - self.empty_gatk_depends = "depends_list_gatk=''\n" + self.empty_depends = "depends_list=''" + self.empty_gatk_depends = "depends_list_gatk=''" self.createprojectcommand = 'project_id="$(dx new project --bill-to %s "%s" --brief --auth-token %s)"\n' self.mokapipe_command = ( "jobid=$(dx run %s%s --priority high -y --name " @@ -200,6 +209,14 @@ def __init__(self, runfolder, now, debug_mode=False): "jobid=$(dx run %s%s --priority high -y --instance-type mem1_ssd1_v2_x8" % (config.app_project, config.RPKM_path) ) + self.ED_readcount_command = ( + "EDjobid=$(dx run %s%s --priority high -y --instance-type %s" + % (config.app_project, config.ED_readcount_path, config.ED_readcount_path_instance_type) + ) + self.ED_cnvcalling_command = ( + "jobid=$(dx run %s%s --priority high -y --instance-type %s" + % (config.app_project, config.ED_cnvcalling_path, config.ED_cnvcalling_instance_type) + ) self.mokaamp_command = ( "jobid=$(dx run %s%s --priority high -y --name " % (config.app_project, config.mokaamp_path) @@ -244,6 
+261,7 @@ def __init__(self, runfolder, now, debug_mode=False): # arguments to capture jobids self.depends_list = 'depends_list="${depends_list} -d ${jobid} "' + self.depends_list_ED_readcount = 'depends_list="${depends_list} -d ${EDjobid} "' self.depends_list_gatk = 'depends_list_gatk="${depends_list_gatk} -d ${jobid} "' self.depends_list_recombined = 'depends_list="${depends_list} ${depends_list_gatk} "' # Argument to define depends_list only if the job ID exists @@ -316,21 +334,25 @@ def quarterback(self): self.runfolder_obj.runfolderpath, self.runfolder_obj.runfolder_name, ) - # check for TSO500 run - this is not demultiplexed locally but the entire runfolder is uploaded - # read samplesheet to create a list of samples - TSO500_sample_list = self.check_for_TSO500() - # if not TSO500 will return None - if TSO500_sample_list: - self.list_of_processed_samples, self.fastq_string = ( - TSO500_sample_list, - self.runfolder_obj.runfolder_samplesheet_path, - ) - + # check for development pan number. If found self.list_of_processed_sampels will be empty and no further processing will occur + if self.check_for_development_run(): + self.loggers.script.info("development pan number identified in samplesheet. 
Stopping any further processing") else: - ( - self.list_of_processed_samples, - self.fastq_string, - ) = self.find_fastqs(self.runfolder_obj.fastq_folder_path) + # check for TSO500 run - this is not demultiplexed locally but the entire runfolder is uploaded + # read samplesheet to create a list of samples + TSO500_sample_list = self.check_for_TSO500() + # if not TSO500 will return None + if TSO500_sample_list: + self.list_of_processed_samples, self.fastq_string = ( + TSO500_sample_list, + self.runfolder_obj.runfolder_samplesheet_path, #this sets the fastq_string to be the samplesheet path + ) + + else: + ( + self.list_of_processed_samples, + self.fastq_string, + ) = self.find_fastqs(self.runfolder_obj.fastq_folder_path) if self.list_of_processed_samples: # build the project name using the WES batch and NGS run numbers @@ -354,9 +376,12 @@ def quarterback(self): view_users_list, admin_users_list ).rstrip() ) + # split tso samplesheet and write split versions to the runfolder # build upload agent command for fastq upload and write stdout to ua_stdout_log # pass path to function which checks files were uploaded without error if TSO500_sample_list: + # split TSO samplesheet to multiple sheets with <=16 samples/sheet + self.TSO500_samplesheets_list = self.split_TSO500_samplesheet() backup_attempt_count = 1 while backup_attempt_count < 5: self.loggers.script.info( @@ -372,6 +397,7 @@ def quarterback(self): # increase backup count backup_attempt_count += 1 + #upload fastqs. if TSO500 run, this uploads the samplesheet to the project root self.look_for_upload_errors(self.upload_fastqs()) # upload cluster density files and check upload was successful. 
@@ -611,6 +637,103 @@ def check_for_TSO500(self): open(self.loggers.upload_agent.filepath, "w").close() return sample_list + def split_TSO500_samplesheet(self): + """ + take TSO500 samplesheet and split in to parts with x samples per samplesheet (x defined in config.batch_size) + write samplesheets to runfolder + returns: list of samplesheet names + """ + # samplesheet in the runfolder + samplesheet_file = self.runfolder_obj.runfolder_samplesheet_path + + samplesheet_header = [] + samples = [] + no_sample_lines = 0 + expected_data_headers = ["Sample_ID", "Sample_Name", "index"] + + # Read all lines from the sample sheet + with open(samplesheet_file) as samplesheet: + for line in reversed(samplesheet.readlines()): + # stop when get to data headers section + if any(header in line for header in expected_data_headers): + break + # skip empty lines (check first element of the line, after splitting on comma) + elif len(line.split(",")[0]) < 2: + pass + # If its a line containing a sample:: + elif line.startswith("TSO"): + samples.append(line) + no_sample_lines += 1 + # get header + with open(samplesheet_file) as samplesheet: + for line in samplesheet.readlines(): + # stop when get to data headers section- add header line to header then break + if any(header in line for header in expected_data_headers): + samplesheet_header.append(line) + break + else: + samplesheet_header.append(line) + + # reverse samples list to get back in correct order (starting at sample 1) + samples.reverse() + + # Split samples into batches (size specified in config) + # batches is a list of lists, where each list is a subset of the samples from the samplesheet + # e.g. 
if batch_size=16, each list will contain up to 16 samples
+        batches = [samples[i:i + config.batch_size] for i in range(0, len(samples), config.batch_size)]
+
+        # Write batches to separate files named "PartXofY", and add samplesheet to list
+        samplesheet_list = []
+        number_of_batches = len(batches)
+        #capture path for samplesheet in runfolder
+        runfolder_samplesheet_file = os.path.join(self.runfolder_obj.runfolderpath, self.runfolder_obj.runfolder_samplesheet_name)
+        samplesheet_base_name = runfolder_samplesheet_file.split(".csv")[0]
+        for samplesheet_count, batch in enumerate(batches, start=1):
+            #capture samplesheet file path to write samplesheet paths to the runfolder
+            samplesheet_filepath = "%sPart%sof%s.csv" % (samplesheet_base_name,samplesheet_count,number_of_batches)
+            # capture samplesheet name to write to list- use runfolder name
+            samplesheet_name = "%s_SampleSheetPart%sof%s.csv" % (self.runfolder_obj.runfolder_name,samplesheet_count,number_of_batches)
+            samplesheet_list.append(samplesheet_name)
+            with open(samplesheet_filepath, "w") as new_samplesheet:
+                new_samplesheet.writelines(samplesheet_header)
+                new_samplesheet.writelines(batch)
+
+        return(samplesheet_list)
+
+    def check_for_development_run(self):
+        """
+        Read samplesheet looking for development pan number.
+        If pannumber where development_run is True is present add samplename to list
+        return sample_list (will return False if empty)
+        """
+        sample_list = []
+        # build list of development pan numbers
+        development_panel_list=[]
+        for pan in self.panel_dictionary.keys():
+            if self.panel_dictionary[pan]["development_run"]:
+                development_panel_list.append(pan)
+
+        with open(
+            self.runfolder_obj.runfolder_samplesheet_path, "r"
+        ) as samplesheet_stream:
+            # read the file into a list and loop through the list in reverse (bottom to top).
+            # this allows us to access the sample names, and stop when reach the column headers, skipping the header of the file.
+ for line in reversed(samplesheet_stream.readlines()): + if line.startswith("Sample_ID") or "[Data]" in line: + break + # skip empty lines (check first element of the line, after splitting on comma) + elif len(line.split(",")[0]) < 2: + pass + # if it's a line detailing a sample + else: + for pannum in development_panel_list: + if pannum in line: + sample_list.append(line.split(",")[0]) + # as it takes a long time before the upload create the file to stop further processing + if sample_list: + open(self.loggers.upload_agent.filepath, "w").close() + return sample_list + def calculate_cluster_density(self, runfolder_path, runfolder_name): """ Inputs = runfolder name and runfolder path @@ -846,7 +969,7 @@ def write_create_project_script(self): # open bash script with open(self.project_bash_script_path, "w") as project_script: project_script.write(self.source_command + "\n") - project_script.write(self.empty_depends) + project_script.write(self.empty_depends + "\n") project_script.write( self.createprojectcommand % ( @@ -1350,6 +1473,15 @@ def nexus_bedfiles(self, pannumber): + self.panel_dictionary[pannumber]["RPKM_bedfile_pan_number"] + "_RPKM.bed" ) + + if self.panel_dictionary[pannumber]["exome_depth_cnvcalling_BED"]: + bed_dict["exome_depth_cnvcalling_BED"] = ( + config.app_project + + config.bedfile_folder + + self.panel_dictionary[pannumber]["exome_depth_cnvcalling_BED"] + + "_CNV.bed" + ) + return bed_dict def start_building_dx_run_cmds(self): @@ -1377,6 +1509,8 @@ def start_building_dx_run_cmds(self): congenica_upload = False joint_variant_calling = False # not currently in use rpkm_list = [] # list for panels needing RPKM analysis + pannnumber_list= [] + exome_depth = False TSO500 = False # loop through samples @@ -1393,9 +1527,15 @@ def start_building_dx_run_cmds(self): elif re.search(r"_R1_", fastq): # extract Pan number and use this to determine which dx run commands are needed for the sample panel = re.search(r"Pan\d+", fastq).group() + #create a list of 
all pan numbers in the run + pannnumber_list.append(panel) # The order in which the modules are called here is important to ensure the order # of dx run commands is correct. This affects which decision support tool data is sent to. - + + # determine if exome depth is needed - the exact commands will be determined in the function which handles exome_depth commands + if self.panel_dictionary[panel]["exome_depth_cnvcalling_BED"]: + exome_depth = True + # If panel is to be processed using MokaWES if self.panel_dictionary[panel]["mokawes"]: # call function to build the MokaWES command and add to command list and depends list @@ -1485,46 +1625,30 @@ def start_building_dx_run_cmds(self): # to stop custom panels being analysed by peddy - may cause problems commands_list.append(self.run_peddy_command()) commands_list.append(self.add_to_depends_list("peddy", 'depends_list')) - + + if exome_depth: + commands_list.append("# Exome depth is run once per capture and then once per Pan number within that capture") + # exome depth is run once per capture, and then for each capture, one per pannumber. This function returns a list of commands so need to add these to commands list + for cmd in self.determine_exome_depth_requirements(pannnumber_list): + commands_list.append(cmd) + + # write TSO commands if a TSO run. 
if TSO500: - # build command for the TSO500 app and set off fastqc commands - commands_list.append(self.create_tso500_command()) - commands_list.append(self.add_to_depends_list("TSO500", 'depends_list')) + commands_list.append("#The TSOapp is set off once for each samplesheet made") + commands_list.append("#Other jobs must be set off manually by running the file once the pipeline has finished") + # build commands for the TSO500 app and set off fastqc commands (need a command per samplesheet) + for samplesheet in self.TSO500_samplesheets_list: + commands_list.append(self.create_tso500_command(samplesheet)) + + self.build_TSO500_post_run_commands() + + # TSO500 multiqc commands are written to a separate file with a function called above + if not TSO500: + commands_list.append(self.create_multiqc_command()) + commands_list.append(self.add_to_depends_list("MultiQC", 'depends_list')) + commands_list.append(self.create_upload_multiqc_command(TSO500)) + commands_list.append(self.add_to_depends_list("UploadMultiQC", 'depends_list')) - # For TSO samples, the fastqs are created within DNAnexus and the - # commands are generated using sample names parsed from the - # samplesheet. If for whatever reason those fastqs are not created - # by the DNAnexus app, the downstream job will not set off and - # therefore will produce no job ID to provide to the depends_list, - # which will create an error/ slack alert. 
To solve this problem, - # the job ID is only added to the depends list if it exits - for sample in self.list_of_processed_samples: - pannumber = re.search(r"Pan\d+", sample).group() - commands_list.append( - self.create_fastqc_command(sample) - ) - # Only add to depends_list if job ID from previous command - # is not empty - commands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) - - commands_list.append(self.create_sambamba_cmd(sample, pannumber)) - # Exclude negative controls from the depends list as the NTC - # coverage calculation can often fail. We want the coverage - # report for the NTC sample to help assess contamination. - # Only add to depends_list if job ID from previous command - # is not empty - commands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) - - if "HD200" in sample: - commands_list.append(self.create_sompy_cmd(sample, pannumber)) - # Only add to depends_list if job ID from previous command - # is not empty - commands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list("sompy", 'depends_list')) - - commands_list.append(self.create_multiqc_command()) - commands_list.append(self.add_to_depends_list("MultiQC", 'depends_list')) - commands_list.append(self.create_upload_multiqc_command(TSO500)) - commands_list.append(self.add_to_depends_list("UploadMultiQC", 'depends_list')) # setoff the below commands later as they are not depended upon by # MultiQC but are required for duty_csv if rpkm_list: @@ -1536,10 +1660,199 @@ def start_building_dx_run_cmds(self): commands_list.append(self.add_to_depends_list("rpkm", 'depends_list')) commands_list.append(self.add_to_depends_list("depends", 'depends_list_recombined')) - commands_list.append(self.create_duty_csv_command()) + if not TSO500: + commands_list.append(self.create_duty_csv_command()) return commands_list + def build_TSO500_post_run_commands(self): + """ + Function to build TSO500 commands to 
run after pipeline, i.e. + Fastqc, sambamba, sompy, multiqc, upload multiqc and duty_csv + Commands must be written to file _TSO_post_run_commands.sh + which can be run manually once pipeline done. + For TSO samples, the fastqs are created within DNAnexus and the + commands are generated using sample names parsed from the + samplesheet. If for whatever reason those fastqs are not created + by the DNAnexus app, the downstream job will not set off and + therefore will produce no job ID to provide to the depends_list, + which will create an error/ slack alert. To solve this problem, + the job ID is only added to the depends list if it exits + """ + # Update script log file to say what is being done. + self.loggers.script.info("Building dx run commands for TSO500 post pipeline processing") + + # list to hold all commands. + TSO500 = True + TSOcommands_list = [] + TSOcommands_list.append(self.source_command) + TSOcommands_list.append(self.empty_depends) + TSOcommands_list.append(self.empty_gatk_depends) + + for sample in self.list_of_processed_samples: + pannumber = re.search(r"Pan\d+", sample).group() + TSOcommands_list.append( + self.create_fastqc_command(sample) + ) + # Only add to depends_list if job ID from previous command + # is not empty + TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) + + TSOcommands_list.append(self.create_sambamba_cmd(sample, pannumber)) + # Exclude negative controls from the depends list as the NTC + # coverage calculation can often fail. We want the coverage + # report for the NTC sample to help assess contamination. 
+ # Only add to depends_list if job ID from previous command + # is not empty + TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list(sample, 'depends_list')) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) + + if "HD200" in sample: + TSOcommands_list.append(self.create_sompy_cmd(sample, pannumber)) + # Only add to depends_list if job ID from previous command + # is not empty + TSOcommands_list.append(self.if_jobid_exists_depends % self.add_to_depends_list("sompy", 'depends_list')) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) + + TSOcommands_list.append(self.create_multiqc_command()) + TSOcommands_list.append(self.add_to_depends_list("MultiQC", 'depends_list')) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) + TSOcommands_list.append(self.create_upload_multiqc_command(TSO500)) + TSOcommands_list.append(self.add_to_depends_list("UploadMultiQC", 'depends_list')) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) + + TSOcommands_list.append(self.create_duty_csv_command()) + TSOcommands_list.append(self.if_jobid_exists_depends % ('echo ${jobid}')) + + with open( + self.runfolder_obj.TSO500_post_run_command_script, "w" + ) as TSO500_commands: + # remove any None values from the command_list + TSO500_commands.writelines( + [line + "\n" for line in filter(None, TSOcommands_list)] + ) + + return TSOcommands_list + + def determine_exome_depth_requirements(self,pannnumber_list): + """ + This function takes a list of all pan numbers found on this run. + Exome depth is run in 2 stages, firstly readcounts are calculated for each capture panel (VCP1 or VCP2 etc). + The jobid will be saved to $EDjobid which allows the output of this stage to be used to filter CNVs with a panel specific BEDfile. 
+ The CNV calling steps should be a dependancy of multiqc + This function controls the order these commands are built and run so the output of the readcount step can be used as an input to the cnvcalling step + Inputs: + List of Pannumbers on the run + Returns: + List of dx run commands + """ + + # generate list of pan numbers in samplenames to process in ED + VCP1=[] + VCP2=[] + VCP3=[] + command_list=[] + + for pannumber in set(pannnumber_list): + # not all VCP1/2/3 pan numbers need CNV calling + if self.panel_dictionary[pannumber]["exome_depth_cnvcalling_BED"]: + if pannumber in config.vcp1_panel_list: + VCP1.append(pannumber) + if pannumber in config.vcp2_panel_list: + VCP2.append(pannumber) + if pannumber in config.vcp3_panel_list: + VCP3.append(pannumber) + + # make sure there are enough samples for that capture + if len(VCP1)>2: + # first build readcount command. + command_list.append(self.build_ED_readcount_cmd(set(VCP1), config.ED_readcount_normals_VCP1_file,config.ED_VCP1_readcount_BEDfile_pannum)) + # The output of readcount can be used in multiqc so add this to the multiqc depends list + command_list.append(self.add_to_depends_list("exomedepth", 'depends_list_ED_readcount')) + # the cnvcalling stage can use the jobid from the readcount stage as an input so run these before the next capture panel + for panel in set(VCP1):# then build cnvcalling commands + command_list.append(self.build_ED_cnv_calling_cmd(panel)) + + if len(VCP2)>2: + # first build readcount command + command_list.append(self.build_ED_readcount_cmd(set(VCP2), config.ED_readcount_normals_VCP2_file,config.ED_VCP2_readcount_BEDfile_pannum)) + command_list.append(self.add_to_depends_list("exomedepth", 'depends_list_ED_readcount')) + for panel in set(VCP2):# then build cnvcalling commands + command_list.append(self.build_ED_cnv_calling_cmd(panel)) + + if len(VCP3)>2: + # first build readcount command + command_list.append(self.build_ED_readcount_cmd(set(VCP3), 
config.ED_readcount_normals_VCP3_file,config.ED_VCP3_readcount_BEDfile_pannum))
+            command_list.append(self.add_to_depends_list("exomedepth", 'depends_list_ED_readcount'))
+            for panel in set(VCP3):# then build cnvcalling commands
+                command_list.append(self.build_ED_cnv_calling_cmd(panel))
+
+        return command_list
+
+    def build_ED_readcount_cmd(self,pannumber_list, normals_file,readcount_bedfile_pannum):
+        """
+        This function builds the dx run command for the exome depth readcount app
+        This is run once per capture panel
+        Inputs:
+            pannumber_list = list of Pan numbers for this capture panel on this run. used to determine which BAM files to download
+            normals_file = predefined panel of normals data file (from config)
+            readcount bedfile pannumber = predefined capture panel wide BEDfile (from config)
+        Returns:
+            dx run cmd (string)
+        """
+        #build bedfile address from the readcount_bedfile_pannum input
+        readcount_bedfile = "%s%s%s" % (config.app_project,config.bedfile_folder,readcount_bedfile_pannum)
+
+        dx_command_list = [
+            self.ED_readcount_command,
+            config.exomedepth_readcount_reference_genome_input,
+            config.exomedepth_readcount_bedfile_input,
+            readcount_bedfile,
+            config.exomedepth_readcount_normalsRdata_input,
+            normals_file,
+            config.exomedepth_readcount_projectname_input,
+            self.runfolder_obj.nexus_project_name,
+            config.exomedepth_readcount_pannumbers_input,
+            ",".join(pannumber_list),
+            self.depends_gatk, # use list of gatk related jobs to delay start
+            self.dest,
+            self.dest_cmd,
+            self.token,
+        ]
+        dx_command = "".join(map(str, dx_command_list))
+        return dx_command
+
+    def build_ED_cnv_calling_cmd(self,pannumber):
+        """
+        This function builds the dx run command to filter the CNV calls for a specific R number using a BEDfile
+        Input:
+            pannumber = pannumber to filter CNV calls
+        Returns:
+            dx run cmd (string)
+        """
+        # pull out the appropriate bedfile for ED cnvcalling app BED from panel config dict (exome_depth_cnvcalling_BED)
+        # note the Pan number for
this BED will be different to that used to name the sample + bedfiles = self.nexus_bedfiles(pannumber) + ed_cnvcalling_bedfile = bedfiles["exome_depth_cnvcalling_BED"] + + dx_command_list = [ + self.ED_cnvcalling_command, + config.exomedepth_cnvcalling_readcount_file_input, + "$EDjobid:%s" % (config.exomedepth_readcount_rdata_output), + config.exomedepth_cnvcalling_subpanel_bed_input, + ed_cnvcalling_bedfile, + config.exomedepth_cnvcalling_projectname_input, + self.runfolder_obj.nexus_project_name, + config.exomedepth_cnvcalling_pannumbers_input, + pannumber, + self.dest, + self.dest_cmd, + self.token, + ] + dx_command = "".join(map(str, dx_command_list)) + return dx_command + def create_mokawes_command(self, fastq, pannumber): """ Input = R1 fastq filename and Pan number for a single sample @@ -1612,7 +1925,7 @@ def create_fastqc_command(self, fastqs): return dx_command - def create_tso500_command(self): + def create_tso500_command(self,samplesheet): """ Build dx run command for tso500 docker app. Will assess if it's a novaseq or not from the runfoldername and if it's @@ -1666,13 +1979,16 @@ def create_tso500_command(self): config.TSO500_samplesheet_stage, self.runfolder_obj.nexus_project_id + ":" - + self.runfolder_obj.runfolder_samplesheet_name, + + self.runfolder_subdir + + "/" + + samplesheet, config.TSO500_project_name_stage, self.runfolder_obj.nexus_project_name, + config.TSO500_runfolder_name_stage, + self.runfolder_subdir, config.TSO500_analysis_options_stage, TSO500_analysis_options, instance_type, - "--wait ", self.dest, self.dest_cmd, self.token, @@ -2041,6 +2357,7 @@ def prepare_rpkm_list(self, rpkm_list): # return list to be used to build rpkm command(s). 
return cleaned_list + def create_rpkm_command(self, pannumber): """ Input = Pannumber for a single RPKM analysis @@ -2173,6 +2490,8 @@ def add_to_depends_list(self, fastq, depends_type): return self.depends_list_gatk elif depends_type=='depends_list_recombined': return self.depends_list_recombined + elif depends_type=='depends_list_ED_readcount': + return self.depends_list_ED_readcount def create_multiqc_command(self): """ @@ -2805,13 +3124,7 @@ def upload_log_files(self): Returns = filepath to the logfile containing output from the command, string of files to be uploaded and name of the stage to test """ # define where files to be uploaded to - nexus_upload_folder = ( - "/" - + self.runfolder_obj.nexus_project_name.replace( - self.nexusproject, "" - ) - + "/Logfiles/" - ) + nexus_upload_folder = ("/%s/Logfiles/" % ("_".join(self.runfolder_obj.nexus_project_name.split("_")[1:]))) # create a list of files to be used to check outputs files_to_upload_list = [] # create a space delimited string of files to be uploaded defined by the logger class