Skip to content

Commit

Permalink
Merge pull request #458 from genomic-medicine-sweden/small_del_blacklist
Browse files Browse the repository at this point in the history
feat: added blacklist filtering of small cnv deletions
  • Loading branch information
Smeds authored May 2, 2024
2 parents 6c6d553 + 3450166 commit b8fbf09
Show file tree
Hide file tree
Showing 11 changed files with 218 additions and 21 deletions.
2 changes: 2 additions & 0 deletions .tests/integration/DATA/cnv_deletion_genes2.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#gene(s) chrmosome start_cnv_region stop_cnv_region gene_start gene_end
TP53 chr17 4638383 10578134 7572906 7579932
3 changes: 3 additions & 0 deletions .tests/integration/DATA/small_deletions_caller_blacklist.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gene chromosome start_pos end_pos
BAP1 chr3 52436283 52443914
TP53 chr17 7572906 7579932
136 changes: 136 additions & 0 deletions .tests/units/gatk_cnv/HD832.HES45_T.filter10.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
@HD VN:1.6
@SQ SN:chr1 LN:249250621
@SQ SN:chr2 LN:243199373
@SQ SN:chr3 LN:198022430
@SQ SN:chr4 LN:191154276
@SQ SN:chr5 LN:180915260
@SQ SN:chr6 LN:171115067
@SQ SN:chr7 LN:159138663
@SQ SN:chr8 LN:146364022
@SQ SN:chr9 LN:141213431
@SQ SN:chr10 LN:135534747
@SQ SN:chr11 LN:135006516
@SQ SN:chr12 LN:133851895
@SQ SN:chr13 LN:115169878
@SQ SN:chr14 LN:107349540
@SQ SN:chr15 LN:102531392
@SQ SN:chr16 LN:90354753
@SQ SN:chr17 LN:81195210
@SQ SN:chr18 LN:78077248
@SQ SN:chr19 LN:59128983
@SQ SN:chr20 LN:63025520
@SQ SN:chr21 LN:48129895
@SQ SN:chr22 LN:51304566
@SQ SN:chrX LN:155270560
@SQ SN:chrY LN:59373566
@SQ SN:chrM LN:16571
@RG ID:GATKCopyNumber SM:VAL-69_T
CONTIG START END LOG2_COPY_RATIO
chr17 7132712 7133612 0.105974
chr17 7215805 7216253 -0.076869
chr17 7216254 7216487 -0.053930
chr17 7216488 7216654 0.085837
chr17 7216655 7216837 -0.027169
chr17 7216838 7217132 0.060059
chr17 7217133 7217353 0.041558
chr17 7217354 7217544 0.016901
chr17 7217545 7217764 -0.005309
chr17 7217765 7218097 0.070831
chr17 7218098 7218641 -0.005943
chr17 7387814 7388446 0.207122
chr17 7398990 7399460 -0.162797
chr17 7399461 7399712 -0.146018
chr17 7399713 7400007 0.145977
chr17 7400008 7400519 0.184896
chr17 7400520 7400902 0.232086
chr17 7400903 7401293 0.073453
chr17 7401294 7401799 -0.038963
chr17 7402088 7402541 0.008168
chr17 7402542 7403080 0.062317
chr17 7403688 7404213 0.036523
chr17 7404214 7404517 0.116535
chr17 7404518 7404773 0.183643
chr17 7404774 7405133 0.198395
chr17 7405134 7405639 0.174971
chr17 7405640 7406230 0.086756
chr17 7406231 7406657 0.090217
chr17 7406658 7406857 0.008901
chr17 7406858 7407380 0.081301
chr17 7411300 7412028 0.239698
chr17 7412029 7412680 0.074670
chr17 7412681 7413221 -0.044962
chr17 7414264 7414681 0.093942
chr17 7414682 7415018 0.157573
chr17 7415019 7415405 0.059885
chr17 7415406 7415729 0.099066
chr17 7415730 7416001 0.160210
chr17 7416002 7416291 0.185480
chr17 7416292 7417766 -0.202713
chr17 7460067 7460967 0.091835
chr17 7572657 7573278 -0.019844
chr17 7573657 7574303 0.151558
chr17 7576267 7576604 -0.217493
chr17 7576756 7576972 -0.000587
chr17 7576973 7577327 -0.091586
chr17 7577328 7577878 -0.098413
chr17 7577907 7578330 -0.089749
chr17 7578331 7578824 -0.158608
chr17 7579042 7579645 -0.408962
chr17 7579646 7579780 -0.977634
chr17 7579781 7580182 0.222077
chr17 7787855 7788671 -0.883565
chr17 7792049 7792688 0.397481
chr17 7792712 7793364 0.253649
chr17 7793619 7794158 -0.463637
chr17 7794159 7794652 0.207440
chr17 7796334 7797005 -0.169586
chr17 7797006 7797343 0.155267
chr17 7797344 7797658 0.064223
chr17 7797659 7798080 -0.020076
chr17 7798081 7798562 0.111615
chr17 7798563 7799130 -0.329941
chr17 7800131 7800882 0.151060
chr17 7801019 7801617 0.083185
chr17 7801618 7802121 0.043944
chr17 7802122 7802590 -0.051191
chr17 7802591 7803037 0.184511
chr17 7803038 7803479 0.213696
chr17 7803480 7803802 0.026167
chr17 7803803 7804109 0.232087
chr17 7804110 7804436 0.007232
chr17 7804437 7804963 0.035956
chr17 7805658 7806150 0.023346
chr17 7806151 7806484 0.056706
chr17 7806485 7806982 0.078185
chr17 7806983 7807534 0.124214
chr17 7807535 7808177 0.156872
chr17 7808178 7808711 0.105312
chr17 7808712 7809099 0.169250
chr17 7809100 7809577 0.287494
chr17 7809601 7810101 0.139272
chr17 7810102 7810396 0.177454
chr17 7810397 7810618 0.350710
chr17 7810619 7810862 0.144804
chr17 7810863 7811116 0.105927
chr17 7811117 7811528 0.216446
chr17 7811529 7811921 0.289250
chr17 7811922 7812304 0.269335
chr17 7812305 7812926 0.077875
chr17 7813476 7814037 0.272691
chr17 7814038 7814536 0.167095
chr17 7814537 7815173 0.103120
chr17 7975819 7976367 0.149621
chr17 7976368 7976805 -0.099943
chr17 7976806 7977345 0.085893
chr17 7978643 7979263 -0.053568
chr17 7979264 7979818 0.070835
chr17 7979819 7980184 -0.034290
chr17 7980185 7980781 0.119204
chr17 7982444 7982972 0.092477
chr17 7982973 7983406 0.366159
chr17 7983407 7983816 0.249227
chr17 7983817 7984150 -0.023842
chr17 7984151 7984359 -0.269555
chr17 7984360 7984775 -0.041958
chr17 7989064 7989808 0.103631
chr17 7990344 7991030 -0.396831
1 change: 1 addition & 0 deletions config/config.data.hg19.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ bwa_mem:
sa: "{{PROJECT_REF_DATA}}/ref_data/hg19/hg19.with.mt.fasta.sa"

call_small_cnv_deletions:
blacklist: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/small_deletions_caller_blacklist.tsv"
regions_file: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/cnv_deletion_genes.tsv"

call_small_cnv_amplifications:
Expand Down
1 change: 1 addition & 0 deletions config/config.data.hg38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ bwa_mem:
sa: "{{PROJECT_REF_DATA}}/ref_data/GRCh38_p14/homo_sapiens.fasta.sa"

call_small_cnv_deletions:
blacklist: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/small_deletions_caller_blacklist_hg38.tsv"
regions_file: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/cnv_deletion_genes_hg38.tsv"

call_small_cnv_amplifications:
Expand Down
5 changes: 5 additions & 0 deletions config/references/design_files.hg19.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@
url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.6.0/cnv/cnv_amplification_genes_240307.tsv

call_small_cnv_deletions:
blacklist:
checksum: 019763e1b79682a56074a3f53a73d149
path: GMS560/cnv/small_deletions_caller_blacklist.tsv
type: file
url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.7.0/cnv/small_deletions_caller_blacklist.tsv
regions_file:
checksum: e044cf88e5e50e68a3a797be5720e6c0
path: GMS560/cnv/cnv_deletion_genes.tsv
Expand Down
5 changes: 5 additions & 0 deletions config/references/design_files.hg38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@
url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.6.0/cnv/cnv_amplification_genes_hg38_240307.tsv

call_small_cnv_deletions:
blacklist:
checksum: 9f57eec94bee9b4edf4bba409c7b7b4c
path: GMS560/cnv/small_deletions_caller_blacklist_hg38.tsv
type: file
url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.7.0/cnv/small_deletions_caller_blacklist_hg38.tsv
regions_file:
checksum: 92510485ce32e9ef64697121aae4122c
path: GMS560/cnv/cnv_deletion_genes_hg38.tsv
Expand Down
5 changes: 3 additions & 2 deletions workflow/rules/call_small_cnv_deletions.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ rule call_small_cnv_deletions:
output:
deletions=temp("cnv_sv/call_small_cnv_deletions/{sample}_{type}.deletions.tsv"),
params:
window_size=config.get("call_small_cnv_deletions", {}).get("window_size", 4),
region_max_size=config.get("call_small_cnv_deletions", {}).get("region_max_size", 30),
blacklist=config.get("call_small_cnv_deletions", {}).get("blacklist", ""),
min_nr_stdev_diff=config.get("call_small_cnv_deletions", {}).get("min_nr_stdev_diff", 2.5),
min_log_odds_diff=config.get("call_small_cnv_deletions", {}).get("min_log_odds_diff", 0.3),
region_max_size=config.get("call_small_cnv_deletions", {}).get("region_max_size", 30),
window_size=config.get("call_small_cnv_deletions", {}).get("window_size", 4),
log:
"cnv_sv/call_small_cnv_deletions/{sample}_{type}.deletions.tsv.log",
benchmark:
Expand Down
15 changes: 9 additions & 6 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,18 +82,21 @@ properties:
Defines both the genomic position of the genes and their surrounding regions that should be analysed for this gene.
The size of the analysis region is dependent on the density of the backbone but should at least 20 backbone SNPs.
40 SNPs is used in Pipeline.
window_size:
type: integer
description: Number of data points included in the sliding window
region_max_size:
type: integer
description: Max number of data points of deletion to be reported
blacklist:
type: string
description: File with gene regions that should be filtered (Format is header + gene\tchr\tpos1\tpos2)
min_nr_stdev_diff:
type: number
description: Min number of standard deviations difference between the deletion and surrounding data points in region
min_log_odds_diff:
type: number
description: Min log2ratio difference between the deletion and the average of surrounding data points in region
region_max_size:
type: integer
description: Max number of data points of deletion to be reported
window_size:
type: integer
description: Number of data points included in the sliding window

cnv_tsv_report:
type: object
Expand Down
24 changes: 21 additions & 3 deletions workflow/scripts/call_small_cnv_deletions.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,12 @@ def find_max_probe_diff(probe_data, window_size):

def filter_deletions(
max_probe_diff_index, min_probe_diff_index, probe_data, gene_probe_index, region, deletions, region_max_size, window_size,
min_log_odds_diff, min_nr_stdev_diff
min_log_odds_diff, min_nr_stdev_diff, blacklist_dict,
):
# Blacklist filter
key = f"{region[0]}_{region[3]}_{region[4]}"
if key in blacklist_dict:
return "Blacklist"
# Filter amplifications
if max_probe_diff_index >= min_probe_diff_index:
return "Amplification"
Expand Down Expand Up @@ -136,10 +140,23 @@ def read_regions_data(regions_file):
return regions


def read_blacklist(blacklist_file_name):
blacklist = open(blacklist_file_name)
blacklist_dict = {}
next(blacklist)
for line in blacklist:
columns = line.strip().split("\t")
key = f"{columns[1]}_{columns[2]}_{columns[3]}"
blacklist_dict[key] = ""
return blacklist_dict


def call_small_cnv_deletions(
cnv_file_name, regions_file, deletions, window_size, region_max_size, min_nr_stdev_diff, min_log_odds_diff,
cnv_file_name, regions_file, deletions, window_size,
region_max_size, min_nr_stdev_diff, min_log_odds_diff, blacklist_file_name,
):
regions = read_regions_data(regions_file)
blacklist_dict = read_blacklist(blacklist_file_name)
sample_name = cnv_file_name.split("/")[1].split("_")[0]
deletions.write("Gene(s)\tChromosome\tGene_start\tGene_end\tLog2_ratio_diff\tMedian_L2R_deletion\t")
deletions.write("Median_L2R_surrounding\tNumber_of_data_points\tNumber_of_stdev\n")
Expand All @@ -152,7 +169,7 @@ def call_small_cnv_deletions(
max_probe_diff_index, min_probe_diff_index = find_max_probe_diff(probe_data, window_size)
filter = filter_deletions(
max_probe_diff_index, min_probe_diff_index, probe_data, gene_probe_index, region, deletions, region_max_size,
window_size, min_log_odds_diff, min_nr_stdev_diff,
window_size, min_log_odds_diff, min_nr_stdev_diff, blacklist_dict
)
return filter

Expand All @@ -168,4 +185,5 @@ def call_small_cnv_deletions(
snakemake.params.region_max_size,
snakemake.params.min_nr_stdev_diff,
snakemake.params.min_log_odds_diff,
snakemake.params.blacklist,
)
Loading

0 comments on commit b8fbf09

Please sign in to comment.