diff --git a/.tests/integration/DATA/cnv_deletion_genes2.tsv b/.tests/integration/DATA/cnv_deletion_genes2.tsv new file mode 100644 index 00000000..b1c5914b --- /dev/null +++ b/.tests/integration/DATA/cnv_deletion_genes2.tsv @@ -0,0 +1,2 @@ +#gene(s) chrmosome start_cnv_region stop_cnv_region gene_start gene_end +TP53 chr17 4638383 10578134 7572906 7579932 diff --git a/.tests/integration/DATA/small_deletions_caller_blacklist.tsv b/.tests/integration/DATA/small_deletions_caller_blacklist.tsv new file mode 100644 index 00000000..e8ba75f9 --- /dev/null +++ b/.tests/integration/DATA/small_deletions_caller_blacklist.tsv @@ -0,0 +1,3 @@ +gene chromosome start_pos end_pos +BAP1 chr3 52436283 52443914 +TP53 chr17 7572906 7579932 diff --git a/.tests/units/gatk_cnv/HD832.HES45_T.filter10.tsv b/.tests/units/gatk_cnv/HD832.HES45_T.filter10.tsv new file mode 100644 index 00000000..09e645ce --- /dev/null +++ b/.tests/units/gatk_cnv/HD832.HES45_T.filter10.tsv @@ -0,0 +1,136 @@ +@HD VN:1.6 +@SQ SN:chr1 LN:249250621 +@SQ SN:chr2 LN:243199373 +@SQ SN:chr3 LN:198022430 +@SQ SN:chr4 LN:191154276 +@SQ SN:chr5 LN:180915260 +@SQ SN:chr6 LN:171115067 +@SQ SN:chr7 LN:159138663 +@SQ SN:chr8 LN:146364022 +@SQ SN:chr9 LN:141213431 +@SQ SN:chr10 LN:135534747 +@SQ SN:chr11 LN:135006516 +@SQ SN:chr12 LN:133851895 +@SQ SN:chr13 LN:115169878 +@SQ SN:chr14 LN:107349540 +@SQ SN:chr15 LN:102531392 +@SQ SN:chr16 LN:90354753 +@SQ SN:chr17 LN:81195210 +@SQ SN:chr18 LN:78077248 +@SQ SN:chr19 LN:59128983 +@SQ SN:chr20 LN:63025520 +@SQ SN:chr21 LN:48129895 +@SQ SN:chr22 LN:51304566 +@SQ SN:chrX LN:155270560 +@SQ SN:chrY LN:59373566 +@SQ SN:chrM LN:16571 +@RG ID:GATKCopyNumber SM:VAL-69_T +CONTIG START END LOG2_COPY_RATIO +chr17 7132712 7133612 0.105974 +chr17 7215805 7216253 -0.076869 +chr17 7216254 7216487 -0.053930 +chr17 7216488 7216654 0.085837 +chr17 7216655 7216837 -0.027169 +chr17 7216838 7217132 0.060059 +chr17 7217133 7217353 0.041558 +chr17 7217354 7217544 0.016901 +chr17 7217545 7217764 -0.005309 +chr17 7217765 7218097 0.070831 +chr17 7218098 7218641 -0.005943 +chr17 7387814 7388446 0.207122 +chr17 7398990 7399460 -0.162797 +chr17 7399461 7399712 -0.146018 +chr17 7399713 7400007 0.145977 +chr17 7400008 7400519 0.184896 +chr17 7400520 7400902 0.232086 +chr17 7400903 7401293 0.073453 +chr17 7401294 7401799 -0.038963 +chr17 7402088 7402541 0.008168 +chr17 7402542 7403080 0.062317 +chr17 7403688 7404213 0.036523 +chr17 7404214 7404517 0.116535 +chr17 7404518 7404773 0.183643 +chr17 7404774 7405133 0.198395 +chr17 7405134 7405639 0.174971 +chr17 7405640 7406230 0.086756 +chr17 7406231 7406657 0.090217 +chr17 7406658 7406857 0.008901 +chr17 7406858 7407380 0.081301 +chr17 7411300 7412028 0.239698 +chr17 7412029 7412680 0.074670 +chr17 7412681 7413221 -0.044962 +chr17 7414264 7414681 0.093942 +chr17 7414682 7415018 0.157573 +chr17 7415019 7415405 0.059885 +chr17 7415406 7415729 0.099066 +chr17 7415730 7416001 0.160210 +chr17 7416002 7416291 0.185480 +chr17 7416292 7417766 -0.202713 +chr17 7460067 7460967 0.091835 +chr17 7572657 7573278 -0.019844 +chr17 7573657 7574303 0.151558 +chr17 7576267 7576604 -0.217493 +chr17 7576756 7576972 -0.000587 +chr17 7576973 7577327 -0.091586 +chr17 7577328 7577878 -0.098413 +chr17 7577907 7578330 -0.089749 +chr17 7578331 7578824 -0.158608 +chr17 7579042 7579645 -0.408962 +chr17 7579646 7579780 -0.977634 +chr17 7579781 7580182 0.222077 +chr17 7787855 7788671 -0.883565 +chr17 7792049 7792688 0.397481 +chr17 7792712 7793364 0.253649 +chr17 7793619 7794158 -0.463637 +chr17 7794159 7794652 0.207440 +chr17 7796334 7797005 -0.169586 +chr17 7797006 7797343 0.155267 +chr17 7797344 7797658 0.064223 +chr17 7797659 7798080 -0.020076 +chr17 7798081 7798562 0.111615 +chr17 7798563 7799130 -0.329941 +chr17 7800131 7800882 0.151060 +chr17 7801019 7801617 0.083185 +chr17 7801618 7802121 0.043944 +chr17 7802122 7802590 -0.051191 +chr17 7802591 7803037 0.184511 +chr17 7803038 7803479 0.213696 +chr17 7803480 7803802 0.026167 +chr17 7803803 7804109 0.232087 +chr17 7804110 7804436 0.007232 +chr17 7804437 7804963 0.035956 +chr17 7805658 7806150 0.023346 +chr17 7806151 7806484 0.056706 +chr17 7806485 7806982 0.078185 +chr17 7806983 7807534 0.124214 +chr17 7807535 7808177 0.156872 +chr17 7808178 7808711 0.105312 +chr17 7808712 7809099 0.169250 +chr17 7809100 7809577 0.287494 +chr17 7809601 7810101 0.139272 +chr17 7810102 7810396 0.177454 +chr17 7810397 7810618 0.350710 +chr17 7810619 7810862 0.144804 +chr17 7810863 7811116 0.105927 +chr17 7811117 7811528 0.216446 +chr17 7811529 7811921 0.289250 +chr17 7811922 7812304 0.269335 +chr17 7812305 7812926 0.077875 +chr17 7813476 7814037 0.272691 +chr17 7814038 7814536 0.167095 +chr17 7814537 7815173 0.103120 +chr17 7975819 7976367 0.149621 +chr17 7976368 7976805 -0.099943 +chr17 7976806 7977345 0.085893 +chr17 7978643 7979263 -0.053568 +chr17 7979264 7979818 0.070835 +chr17 7979819 7980184 -0.034290 +chr17 7980185 7980781 0.119204 +chr17 7982444 7982972 0.092477 +chr17 7982973 7983406 0.366159 +chr17 7983407 7983816 0.249227 +chr17 7983817 7984150 -0.023842 +chr17 7984151 7984359 -0.269555 +chr17 7984360 7984775 -0.041958 +chr17 7989064 7989808 0.103631 +chr17 7990344 7991030 -0.396831 diff --git a/config/config.data.hg19.yaml b/config/config.data.hg19.yaml index 2825fd0e..b3d59399 100644 --- a/config/config.data.hg19.yaml +++ b/config/config.data.hg19.yaml @@ -54,6 +54,7 @@ bwa_mem: sa: "{{PROJECT_REF_DATA}}/ref_data/hg19/hg19.with.mt.fasta.sa" call_small_cnv_deletions: + blacklist: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/small_deletions_caller_blacklist.tsv" regions_file: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/cnv_deletion_genes.tsv" call_small_cnv_amplifications: diff --git a/config/config.data.hg38.yaml b/config/config.data.hg38.yaml index 3ffc1ef3..ea8cb67f 100644 --- a/config/config.data.hg38.yaml +++ b/config/config.data.hg38.yaml @@ -54,6 +54,7 @@ bwa_mem: sa: "{{PROJECT_REF_DATA}}/ref_data/GRCh38_p14/homo_sapiens.fasta.sa" call_small_cnv_deletions: + blacklist: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/small_deletions_caller_blacklist_hg38.tsv" regions_file: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/cnv_deletion_genes_hg38.tsv" call_small_cnv_amplifications: diff --git a/config/references/design_files.hg19.yaml b/config/references/design_files.hg19.yaml index fb6da1d7..c10abc1f 100644 --- a/config/references/design_files.hg19.yaml +++ b/config/references/design_files.hg19.yaml @@ -48,6 +48,11 @@ url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.6.0/cnv/cnv_amplification_genes_240307.tsv call_small_cnv_deletions: + blacklist: + checksum: 019763e1b79682a56074a3f53a73d149 + path: GMS560/cnv/small_deletions_caller_blacklist.tsv + type: file + url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.7.0/cnv/small_deletions_caller_blacklist.tsv regions_file: checksum: e044cf88e5e50e68a3a797be5720e6c0 path: GMS560/cnv/cnv_deletion_genes.tsv diff --git a/config/references/design_files.hg38.yaml b/config/references/design_files.hg38.yaml index 475d0b60..d968faa1 100644 --- a/config/references/design_files.hg38.yaml +++ b/config/references/design_files.hg38.yaml @@ -48,6 +48,11 @@ url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.6.0/cnv/cnv_amplification_genes_hg38_240307.tsv call_small_cnv_deletions: + blacklist: + checksum: 9f57eec94bee9b4edf4bba409c7b7b4c + path: GMS560/cnv/small_deletions_caller_blacklist_hg38.tsv + type: file + url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.7.0/cnv/small_deletions_caller_blacklist_hg38.tsv regions_file: checksum: 92510485ce32e9ef64697121aae4122c path: GMS560/cnv/cnv_deletion_genes_hg38.tsv diff --git a/workflow/rules/call_small_cnv_deletions.smk b/workflow/rules/call_small_cnv_deletions.smk index 0e6084ae..cc2d7885 100644 --- a/workflow/rules/call_small_cnv_deletions.smk +++ b/workflow/rules/call_small_cnv_deletions.smk @@ -11,10 +11,11 @@ rule call_small_cnv_deletions: output: deletions=temp("cnv_sv/call_small_cnv_deletions/{sample}_{type}.deletions.tsv"), params: - window_size=config.get("call_small_cnv_deletions", {}).get("window_size", 4), - region_max_size=config.get("call_small_cnv_deletions", {}).get("region_max_size", 30), + blacklist=config.get("call_small_cnv_deletions", {}).get("blacklist", ""), min_nr_stdev_diff=config.get("call_small_cnv_deletions", {}).get("min_nr_stdev_diff", 2.5), min_log_odds_diff=config.get("call_small_cnv_deletions", {}).get("min_log_odds_diff", 0.3), + region_max_size=config.get("call_small_cnv_deletions", {}).get("region_max_size", 30), + window_size=config.get("call_small_cnv_deletions", {}).get("window_size", 4), log: "cnv_sv/call_small_cnv_deletions/{sample}_{type}.deletions.tsv.log", benchmark: diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 298b94f2..fd9aebaa 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -82,18 +82,21 @@ properties: Defines both the genomic position of the genes and their surrounding regions that should be analysed for this gene. The size of the analysis region is dependent on the density of the backbone but should at least 20 backbone SNPs. 40 SNPs is used in Pipeline. - window_size: - type: integer - description: Number of data points included in the sliding window - region_max_size: - type: integer - description: Max number of data points of deletion to be reported + blacklist: + type: string + description: File with gene regions that should be filtered (Format is header + gene\tchr\tpos1\tpos2) min_nr_stdev_diff: type: number description: Min number of standard deviations difference between the deletion and surrounding data points in region min_log_odds_diff: type: number description: Min log2ratio difference between the deletion and the average of surrounding data points in region + region_max_size: + type: integer + description: Max number of data points of deletion to be reported + window_size: + type: integer + description: Number of data points included in the sliding window cnv_tsv_report: type: object diff --git a/workflow/scripts/call_small_cnv_deletions.py b/workflow/scripts/call_small_cnv_deletions.py index f28f3b1e..d9432247 100644 --- a/workflow/scripts/call_small_cnv_deletions.py +++ b/workflow/scripts/call_small_cnv_deletions.py @@ -62,8 +62,12 @@ def find_max_probe_diff(probe_data, window_size): def filter_deletions( max_probe_diff_index, min_probe_diff_index, probe_data, gene_probe_index, region, deletions, region_max_size, window_size, - min_log_odds_diff, min_nr_stdev_diff + min_log_odds_diff, min_nr_stdev_diff, blacklist_dict, ): + # Blacklist filter + key = f"{region[0]}_{region[3]}_{region[4]}" + if key in blacklist_dict: + return "Blacklist" # Filter amplifications if max_probe_diff_index >= min_probe_diff_index: return "Amplification" @@ -136,10 +140,23 @@ def read_regions_data(regions_file): return regions +def read_blacklist(blacklist_file_name): + blacklist = open(blacklist_file_name) + blacklist_dict = {} + next(blacklist) + for line in blacklist: + columns = line.strip().split("\t") + key = f"{columns[1]}_{columns[2]}_{columns[3]}" + blacklist_dict[key] = "" + return blacklist_dict + + def call_small_cnv_deletions( - cnv_file_name, regions_file, deletions, window_size, region_max_size, min_nr_stdev_diff, min_log_odds_diff, + cnv_file_name, regions_file, deletions, window_size, + region_max_size, min_nr_stdev_diff, min_log_odds_diff, blacklist_file_name, ): regions = read_regions_data(regions_file) + blacklist_dict = read_blacklist(blacklist_file_name) sample_name = cnv_file_name.split("/")[1].split("_")[0] deletions.write("Gene(s)\tChromosome\tGene_start\tGene_end\tLog2_ratio_diff\tMedian_L2R_deletion\t") deletions.write("Median_L2R_surrounding\tNumber_of_data_points\tNumber_of_stdev\n") @@ -152,7 +169,7 @@ def call_small_cnv_deletions( max_probe_diff_index, min_probe_diff_index = find_max_probe_diff(probe_data, window_size) filter = filter_deletions( max_probe_diff_index, min_probe_diff_index, probe_data, gene_probe_index, region, deletions, region_max_size, - window_size, min_log_odds_diff, min_nr_stdev_diff, + window_size, min_log_odds_diff, min_nr_stdev_diff, blacklist_dict ) return filter @@ -168,4 +185,5 @@ def call_small_cnv_deletions( snakemake.params.region_max_size, snakemake.params.min_nr_stdev_diff, snakemake.params.min_log_odds_diff, + snakemake.params.blacklist, ) diff --git a/workflow/scripts/call_small_cnv_deletions_test.py b/workflow/scripts/call_small_cnv_deletions_test.py index 8a21b7ab..60188b9b 100644 --- a/workflow/scripts/call_small_cnv_deletions_test.py +++ b/workflow/scripts/call_small_cnv_deletions_test.py @@ -39,12 +39,13 @@ def test_call_small_cnv_deletions_ok(self): # True call cnv_data1 = ".tests/units/gatk_cnv/HD832.HES45_T.test1.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data1, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) out_deletions.close() @@ -64,13 +65,14 @@ def test_call_small_cnv_deletions_ok(self): def test_call_small_cnv_deletions_filter(self): from call_small_cnv_deletions import call_small_cnv_deletions - # Filter 1 (Amplifactions) + # Filter 1 (Amplifications) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter1.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Amplification", filter) out_deletions.close() @@ -78,11 +80,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 2 (Large deletions) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter2.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Too_large", filter) out_deletions.close() @@ -90,11 +93,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 4 (Filter deletions not in the actual gene of interest) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter4.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Not_in_gene", filter) out_deletions.close() @@ -102,11 +106,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 5 (Filter too short deletions) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter5.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Too_small", filter) out_deletions.close() @@ -114,11 +119,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 6 (Filter deletions with positive median) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter6.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Not_deletion", filter) out_deletions.close() @@ -126,11 +132,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 7 (Too low difference in log odds ratio) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter7.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Low_abs_diff", filter) out_deletions.close() @@ -138,11 +145,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 8 (Too few standard deviations in difference) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.test1.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, 100, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Low_nr_std_diff", filter) out_deletions.close() @@ -150,12 +158,26 @@ def test_call_small_cnv_deletions_filter(self): # Filter 9 (Filter deletions with too few data points outside to calculate median and standard deviations) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter8.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Too_few_outside", filter) out_deletions.close() regions_file.close() + + # Filter 10 (Filter blacklisted deletions) + cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter10.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" + regions_file = open(".tests/integration/DATA/cnv_deletion_genes2.tsv") + out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") + filter = call_small_cnv_deletions( + cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, + self.min_log_odds_diff, blacklist_file_name, + ) + self._test_call_small_cnv_deletions_filter("Blacklist", filter) + out_deletions.close() + regions_file.close()