diff --git a/.tests/integration/DATA/cnv_deletion_genes2.tsv b/.tests/integration/DATA/cnv_deletion_genes2.tsv new file mode 100644 index 00000000..b1c5914b --- /dev/null +++ b/.tests/integration/DATA/cnv_deletion_genes2.tsv @@ -0,0 +1,2 @@ +#gene(s) chrmosome start_cnv_region stop_cnv_region gene_start gene_end +TP53 chr17 4638383 10578134 7572906 7579932 diff --git a/.tests/integration/DATA/small_deletions_caller_blacklist.tsv b/.tests/integration/DATA/small_deletions_caller_blacklist.tsv new file mode 100644 index 00000000..e8ba75f9 --- /dev/null +++ b/.tests/integration/DATA/small_deletions_caller_blacklist.tsv @@ -0,0 +1,3 @@ +gene chromosome start_pos end_pos +BAP1 chr3 52436283 52443914 +TP53 chr17 7572906 7579932 diff --git a/.tests/integration/config/config.yaml b/.tests/integration/config/config.yaml index 8a1b1383..0652ece1 100644 --- a/.tests/integration/config/config.yaml +++ b/.tests/integration/config/config.yaml @@ -109,11 +109,10 @@ hotspot_annotation: hotspots: "DATA/Hotspots_combined.csv" chr_translation_file: "../../config/reports/hotspot_report.chr.translation.hg19" -hotspot_info: - hotspot_mutations: "DATA/Hotspots_combined.csv" - hotspot_report: - hotspot_mutations: "DATA/Hotspots_combined.csv" + hotspot_mutations: + all: "DATA/Hotspots_combined.csv" + ENC: "DATA/Hotspots_combined.csv" report_config: "../../config/reports/config/reports/hotspot_report.yaml" chr_translation_file: "../../reports/config/reports/hotspot_report.chr.translation.hg19" levels: diff --git a/.tests/jenkins/Jenkinsfile b/.tests/jenkins/Jenkinsfile index 0237af7f..43b7d94d 100644 --- a/.tests/jenkins/Jenkinsfile +++ b/.tests/jenkins/Jenkinsfile @@ -18,6 +18,14 @@ pipeline { virtualenv venv -p python3.9 source venv/bin/activate pip3 install -r requirements.txt + + hydra-genetics --debug references download -o /beegfs-storage/design_and_ref_files -v config/references/design_files.hg19.yaml + hydra-genetics --debug references download -o /beegfs-storage/design_and_ref_files -v config/references/nextseq.hg19.pon.yaml + hydra-genetics --debug references download -o /beegfs-storage/design_and_ref_files -v config/references/novaseq.hg19.pon.yaml + hydra-genetics --debug references download -o /beegfs-storage/design_and_ref_files -v config/references/design_files.hg38.yaml + hydra-genetics --debug references download -o /beegfs-storage/design_and_ref_files -v config/references/nextseq.hg38.pon.yaml + #hydra-genetics --debug references download -o /beegfs-storage/design_and_ref_files -v config/references/novaseq.hg38.pon.yaml + mkdir -p develop/slurm_out cp -r config develop/ cp .tests/jenkins/units_develop.tsv develop/units.tsv diff --git a/.tests/jenkins/test_input_develop.tsv b/.tests/jenkins/test_input_develop.tsv index 1dcd7dd6..2eea3623 100644 --- a/.tests/jenkins/test_input_develop.tsv +++ b/.tests/jenkins/test_input_develop.tsv @@ -4,16 +4,17 @@ bam_rna/SeraSeq-NTRK-FFPE-2-5M_R.star_fusion.bam 7be7432fe3d040f81bbd240452f7463 results/dna/VAL2022-2-5M_T/additional_files/biomarker/VAL2022-2-5M_T.pathology.scarhrd_cnvkit_score.txt 748d10cd1ae3ab6cb7849cfd0e9f39c0 results/dna/VAL2022-2-5M_T/additional_files/biomarker/VAL2022-2-5M_T.purecn.scarhrd_cnvkit_score.txt 55145eab2e60e588c66b233cf1479287 results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.amplifications.tsv f397f92f69100d1f20ca65ffac830dd0 -results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.deletions.tsv f887194ae8134c8a1c8fd427ae533646 -results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.manta_tumorSV.vcf.gz 1c4f0a2b346d1355dc76fcd126222729 -results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.pathology.amp_all_del_all.cnv_report.tsv 2b09144044b2bdf63c440082be760b26 -results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.pathology.amp_all_del_validated.cnv_report.tsv ceb3be1abd7951617928cd5c3572a8a1 -results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.pathology_purecn.amp_all_del_all.cnv_report.tsv 8a85c32ff5b4e0668376043e16dc0e77 -results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.pathology_purecn.svdb_query.vcf c5ebcc72790e0c6f5186298bf5890348 -results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.pathology.svdb_query.vcf c51a58f9332c9176b0f354a955fd25bc -results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.purecn.amp_all_del_all.cnv_report.tsv 8a85c32ff5b4e0668376043e16dc0e77 -results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.purecn.amp_all_del_validated.cnv_report.tsv 2438c93ab15eb0e2f6595ac5c006a1f2 -results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.purecn.svdb_query.vcf 49498b22ca7af9176b31e194ef5eb42f +results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.deletions.tsv e441b862e6d6d573d18fe40f1eaa7103 +results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.manta_tumorSV.vcf.gz 05c5aead798d0a04dc762657fc377f70 +results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.pathology.amp_all_del_all.cnv_report.tsv 29beb70a92981779b52d26f5b3a38434 +results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.pathology.amp_all_del_validated.cnv_report.tsv 1bb1a34245f59a884358fb21ff2a5f69 +results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.pathology_purecn.amp_all_del_all.cnv_report.tsv 1115cf9989f5ce9725bf69f2e397fde9 +results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.pathology_purecn.svdb_query.vcf 9a4d542bbb31bb6adf8178ffe6cf323a +results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.pathology.svdb_query.vcf dd3c24e0c05d79eb7a2d9ca02aa73156 +results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.purecn.amp_all_del_all.cnv_report.tsv 1115cf9989f5ce9725bf69f2e397fde9 +results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.purecn.amp_all_del_validated.cnv_report.tsv 227717b4bc1de967c6ebeff9e240a274 +results/dna/VAL2022-2-5M_T/additional_files/cnv/VAL2022-2-5M_T.purecn.svdb_query.vcf bade0d7c0199cd3d31f0ec2bd4d1b378 +results/dna/VAL2022-2-5M_T/additional_files/fusion/VAL2022-2-5M_T.fuseq_wes.unfiltered.results.csv 8ecac127fc74039d5b9eb36e522ac4a5 results/dna/VAL2022-2-5M_T/additional_files/qc/VAL2022-2-5M_T.alignment_summary_metrics.txt fa88f8adc1d6146cf399c6d17a37d098 results/dna/VAL2022-2-5M_T/additional_files/qc/VAL2022-2-5M_T.contamination.table a4ee6cd2e1cfe029229f5b218ac318e9 results/dna/VAL2022-2-5M_T/additional_files/qc/VAL2022-2-5M_T.duplication_metrics.txt 1c476addf89d82bf68aea08eba98aacf @@ -28,9 +29,10 @@ results/dna/VAL2022-2-5M_T/additional_files/vcf/vardict_VAL2022-2-5M_T.vcf.gz 22 results/dna/VAL2022-2-5M_T/biomarker/VAL2022-2-5M_T.msisensor_pro.score.tsv e47f1fb00f0220a20335f972e1af8d39 results/dna/VAL2022-2-5M_T/biomarker/VAL2022-2-5M_T.pathology_purecn.scarhrd_cnvkit_score.txt 55145eab2e60e588c66b233cf1479287 results/dna/VAL2022-2-5M_T/biomarker/VAL2022-2-5M_T.TMB.txt e20d98b80deb3ad74605c0439abda1f2 -results/dna/VAL2022-2-5M_T/cnv/VAL2022-2-5M_T.pathology_purecn.amp_all_del_validated.cnv_report.tsv 2438c93ab15eb0e2f6595ac5c006a1f2 +results/dna/VAL2022-2-5M_T/cnv/VAL2022-2-5M_T.pathology_purecn.amp_all_del_validated.cnv_report.tsv 227717b4bc1de967c6ebeff9e240a274 results/dna/VAL2022-2-5M_T/fusion/VAL2022-2-5M_T.fuseq_wes.report.csv ada3432e93dd1ff64f9254d8c83c91c5 -results/dna/VAL2022-2-5M_T/qc/VAL2022-2-5M_T.coverage_and_mutations.tsv 62666e179f8e43450e4dd65a75928e34 +results/dna/VAL2022-2-5M_T/qc/VAL2022-2-5M_T.coverage_and_mutations.tsv 8921f53aa4902a009fb34b6e6db2a8da +results/dna/VAL2022-2-5M_T/qc/VAL2022-2-5M_T.coverage_and_mutations.ENC.tsv 5331bcd5647fac6ee1424b07b3a81651 results/dna/VAL2022-2-5M_T/vcf/VAL2022-2-5M_T.annotated.exon_only.filter.hard_filter.codon_snv.qci.vcf e9dc0afeeb5d2b738183ce465973b292 results/dna/VAL2022-2-5M_T/vcf/VAL2022-2-5M_T.annotated.exon_only.filter.hard_filter.codon_snv.vcf 7c62b3abce55f9eacce567211dbb0a1b results/rna/SeraSeq-NTRK-FFPE-2-5M_R/additional_files/fusion/SeraSeq-NTRK-FFPE-2-5M_R.arriba.fusions.tsv eedffe64014818557f8fd9ea2c21fa7c diff --git a/.tests/jenkins/test_input_main.tsv b/.tests/jenkins/test_input_main.tsv index 47bf6d6a..9e274a4f 100644 --- a/.tests/jenkins/test_input_main.tsv +++ b/.tests/jenkins/test_input_main.tsv @@ -17,6 +17,7 @@ results/dna/HD827_T/additional_files/cnv/HD827_T.pathology.svdb_query.vcf 2c73b0 results/dna/HD827_T/additional_files/cnv/HD827_T.purecn.amp_all_del_all.cnv_report.tsv 01660be1832c394b02e87e47e11afd9a results/dna/HD827_T/additional_files/cnv/HD827_T.purecn.amp_all_del_validated.cnv_report.tsv 83f8461e2ac1537ed67f5fbb48cc6e09 results/dna/HD827_T/additional_files/cnv/HD827_T.purecn.svdb_query.vcf 9e1269f0072aa92eeaf969432af18ad8 +results/dna/HD827_T/additional_files/fusion/HD827_T.fuseq_wes.unfiltered.results.csv 3350bfff935006e7375301954261e468 results/dna/HD827_T/additional_files/qc/HD827_T.alignment_summary_metrics.txt 351f113346779003eae764ab6d16c5c4 results/dna/HD827_T/additional_files/qc/HD827_T.contamination.table 3c9511e7efab4fefd32144ff2ad19d19 results/dna/HD827_T/additional_files/qc/HD827_T.duplication_metrics.txt 9ac673f2e7d28e6d9601bc096b7764e4 @@ -49,6 +50,7 @@ results/dna/HD832_T/additional_files/cnv/HD832_T.pathology.svdb_query.vcf ed9da0 results/dna/HD832_T/additional_files/cnv/HD832_T.purecn.amp_all_del_all.cnv_report.tsv eb1ad324eff1937f3e6db0c6cbb83aba results/dna/HD832_T/additional_files/cnv/HD832_T.purecn.amp_all_del_validated.cnv_report.tsv 4ba0962054a276e1cfdb4298ef815240 results/dna/HD832_T/additional_files/cnv/HD832_T.purecn.svdb_query.vcf c9e96e55f8d252282ee43004710d731f +results/dna/HD832_T/additional_files/fusion/HD832_T.fuseq_wes.unfiltered.results.csv 23ac6dddb35fb8067fa79465fb6f6acf results/dna/HD832_T/additional_files/qc/HD832_T.alignment_summary_metrics.txt 8b2548b05cc43eaff20aeabb16aa1490 results/dna/HD832_T/additional_files/qc/HD832_T.contamination.table 18bef22a4210099ec22b2444e6ac2482 results/dna/HD832_T/additional_files/qc/HD832_T.duplication_metrics.txt 20cd0962dab4ca0011aec2100f4a572e @@ -81,6 +83,7 @@ results/dna/NA12878_T/additional_files/cnv/NA12878_T.pathology.svdb_query.vcf 7d results/dna/NA12878_T/additional_files/cnv/NA12878_T.purecn.amp_all_del_all.cnv_report.tsv c626b45394631f05c401350305511bf0 results/dna/NA12878_T/additional_files/cnv/NA12878_T.purecn.amp_all_del_validated.cnv_report.tsv 76bdd87a4ed97e304a79e6ff2b3279d5 results/dna/NA12878_T/additional_files/cnv/NA12878_T.purecn.svdb_query.vcf 8b6a8f92b09c532570e93a1d6f0e6b73 +results/dna/NA12878_T/additional_files/fusion/NA12878_T.fuseq_wes.unfiltered.results.csv 003e8f796c80c7e52322630800d36d41 results/dna/NA12878_T/additional_files/qc/NA12878_T.alignment_summary_metrics.txt 3256b89a54ac29cf4f7ef0d6551a3be3 results/dna/NA12878_T/additional_files/qc/NA12878_T.contamination.table e2ab7189dcf7c8cfb1f0ac24ac8766dc results/dna/NA12878_T/additional_files/qc/NA12878_T.duplication_metrics.txt 355ba5b555b9d83a5820f054c323eef6 @@ -113,6 +116,7 @@ results/dna/VAL2022_T/additional_files/cnv/VAL2022_T.pathology.svdb_query.vcf eb results/dna/VAL2022_T/additional_files/cnv/VAL2022_T.purecn.amp_all_del_all.cnv_report.tsv 21a1a191bbd182e974888af7fd35842b results/dna/VAL2022_T/additional_files/cnv/VAL2022_T.purecn.amp_all_del_validated.cnv_report.tsv badb0ef5f90cf55dc80fafeb9c415787 results/dna/VAL2022_T/additional_files/cnv/VAL2022_T.purecn.svdb_query.vcf abde954047d0b215e596d6305a96ed62 +results/dna/VAL2022_T/additional_files/fusion/VAL2022_T.fuseq_wes.unfiltered.results.csv a29ae481538efadb7403814d9d8f76e9 results/dna/VAL2022_T/additional_files/qc/VAL2022_T.alignment_summary_metrics.txt 6c99476824c691ca10e75382ed9bb059 results/dna/VAL2022_T/additional_files/qc/VAL2022_T.contamination.table c1d8fc1cdb993dda50a430028277c37b results/dna/VAL2022_T/additional_files/qc/VAL2022_T.duplication_metrics.txt 31611542aafb4495560e062cac828f08 diff --git a/.tests/units/gatk_cnv/HD832.HES45_T.filter10.tsv b/.tests/units/gatk_cnv/HD832.HES45_T.filter10.tsv new file mode 100644 index 00000000..09e645ce --- /dev/null +++ b/.tests/units/gatk_cnv/HD832.HES45_T.filter10.tsv @@ -0,0 +1,136 @@ +@HD VN:1.6 +@SQ SN:chr1 LN:249250621 +@SQ SN:chr2 LN:243199373 +@SQ SN:chr3 LN:198022430 +@SQ SN:chr4 LN:191154276 +@SQ SN:chr5 LN:180915260 +@SQ SN:chr6 LN:171115067 +@SQ SN:chr7 LN:159138663 +@SQ SN:chr8 LN:146364022 +@SQ SN:chr9 LN:141213431 +@SQ SN:chr10 LN:135534747 +@SQ SN:chr11 LN:135006516 +@SQ SN:chr12 LN:133851895 +@SQ SN:chr13 LN:115169878 +@SQ SN:chr14 LN:107349540 +@SQ SN:chr15 LN:102531392 +@SQ SN:chr16 LN:90354753 +@SQ SN:chr17 LN:81195210 +@SQ SN:chr18 LN:78077248 +@SQ SN:chr19 LN:59128983 +@SQ SN:chr20 LN:63025520 +@SQ SN:chr21 LN:48129895 +@SQ SN:chr22 LN:51304566 +@SQ SN:chrX LN:155270560 +@SQ SN:chrY LN:59373566 +@SQ SN:chrM LN:16571 +@RG ID:GATKCopyNumber SM:VAL-69_T +CONTIG START END LOG2_COPY_RATIO +chr17 7132712 7133612 0.105974 +chr17 7215805 7216253 -0.076869 +chr17 7216254 7216487 -0.053930 +chr17 7216488 7216654 0.085837 +chr17 7216655 7216837 -0.027169 +chr17 7216838 7217132 0.060059 +chr17 7217133 7217353 0.041558 +chr17 7217354 7217544 0.016901 +chr17 7217545 7217764 -0.005309 +chr17 7217765 7218097 0.070831 +chr17 7218098 7218641 -0.005943 +chr17 7387814 7388446 0.207122 +chr17 7398990 7399460 -0.162797 +chr17 7399461 7399712 -0.146018 +chr17 7399713 7400007 0.145977 +chr17 7400008 7400519 0.184896 +chr17 7400520 7400902 0.232086 +chr17 7400903 7401293 0.073453 +chr17 7401294 7401799 -0.038963 +chr17 7402088 7402541 0.008168 +chr17 7402542 7403080 0.062317 +chr17 7403688 7404213 0.036523 +chr17 7404214 7404517 0.116535 +chr17 7404518 7404773 0.183643 +chr17 7404774 7405133 0.198395 +chr17 7405134 7405639 0.174971 +chr17 7405640 7406230 0.086756 +chr17 7406231 7406657 0.090217 +chr17 7406658 7406857 0.008901 +chr17 7406858 7407380 0.081301 +chr17 7411300 7412028 0.239698 +chr17 7412029 7412680 0.074670 +chr17 7412681 7413221 -0.044962 +chr17 7414264 7414681 0.093942 +chr17 7414682 7415018 0.157573 +chr17 7415019 7415405 0.059885 +chr17 7415406 7415729 0.099066 +chr17 7415730 7416001 0.160210 +chr17 7416002 7416291 0.185480 +chr17 7416292 7417766 -0.202713 +chr17 7460067 7460967 0.091835 +chr17 7572657 7573278 -0.019844 +chr17 7573657 7574303 0.151558 +chr17 7576267 7576604 -0.217493 +chr17 7576756 7576972 -0.000587 +chr17 7576973 7577327 -0.091586 +chr17 7577328 7577878 -0.098413 +chr17 7577907 7578330 -0.089749 +chr17 7578331 7578824 -0.158608 +chr17 7579042 7579645 -0.408962 +chr17 7579646 7579780 -0.977634 +chr17 7579781 7580182 0.222077 +chr17 7787855 7788671 -0.883565 +chr17 7792049 7792688 0.397481 +chr17 7792712 7793364 0.253649 +chr17 7793619 7794158 -0.463637 +chr17 7794159 7794652 0.207440 +chr17 7796334 7797005 -0.169586 +chr17 7797006 7797343 0.155267 +chr17 7797344 7797658 0.064223 +chr17 7797659 7798080 -0.020076 +chr17 7798081 7798562 0.111615 +chr17 7798563 7799130 -0.329941 +chr17 7800131 7800882 0.151060 +chr17 7801019 7801617 0.083185 +chr17 7801618 7802121 0.043944 +chr17 7802122 7802590 -0.051191 +chr17 7802591 7803037 0.184511 +chr17 7803038 7803479 0.213696 +chr17 7803480 7803802 0.026167 +chr17 7803803 7804109 0.232087 +chr17 7804110 7804436 0.007232 +chr17 7804437 7804963 0.035956 +chr17 7805658 7806150 0.023346 +chr17 7806151 7806484 0.056706 +chr17 7806485 7806982 0.078185 +chr17 7806983 7807534 0.124214 +chr17 7807535 7808177 0.156872 +chr17 7808178 7808711 0.105312 +chr17 7808712 7809099 0.169250 +chr17 7809100 7809577 0.287494 +chr17 7809601 7810101 0.139272 +chr17 7810102 7810396 0.177454 +chr17 7810397 7810618 0.350710 +chr17 7810619 7810862 0.144804 +chr17 7810863 7811116 0.105927 +chr17 7811117 7811528 0.216446 +chr17 7811529 7811921 0.289250 +chr17 7811922 7812304 0.269335 +chr17 7812305 7812926 0.077875 +chr17 7813476 7814037 0.272691 +chr17 7814038 7814536 0.167095 +chr17 7814537 7815173 0.103120 +chr17 7975819 7976367 0.149621 +chr17 7976368 7976805 -0.099943 +chr17 7976806 7977345 0.085893 +chr17 7978643 7979263 -0.053568 +chr17 7979264 7979818 0.070835 +chr17 7979819 7980184 -0.034290 +chr17 7980185 7980781 0.119204 +chr17 7982444 7982972 0.092477 +chr17 7982973 7983406 0.366159 +chr17 7983407 7983816 0.249227 +chr17 7983817 7984150 -0.023842 +chr17 7984151 7984359 -0.269555 +chr17 7984360 7984775 -0.041958 +chr17 7989064 7989808 0.103631 +chr17 7990344 7991030 -0.396831 diff --git a/config/config.data.hg19.yaml b/config/config.data.hg19.yaml index 2825fd0e..bb154743 100644 --- a/config/config.data.hg19.yaml +++ b/config/config.data.hg19.yaml @@ -54,13 +54,14 @@ bwa_mem: sa: "{{PROJECT_REF_DATA}}/ref_data/hg19/hg19.with.mt.fasta.sa" call_small_cnv_deletions: + blacklist: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/small_deletions_caller_blacklist.tsv" regions_file: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/cnv_deletion_genes.tsv" call_small_cnv_amplifications: regions_file: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/cnv_amplification_genes_240307.tsv" cnvkit_batch: - normal_reference: "{{PROJECT_PON_DATA}}/GMS560/PoN/cnvkit_nextseq_36.cnn" + normal_reference: "{{PROJECT_PON_DATA}}/GMS560/PoN/cnvkit_PoN_combined_hg19.cnn" cnvkit_batch_hrd: normal_reference_hrd: "{{PROJECT_PON_DATA}}/GMS560/PoN/cnvkit_nextseq_27_HRD.cnn" @@ -77,6 +78,12 @@ filter_fuseq_wes: transcript_black_list: "{{PROJECT_DESIGN_DATA}}/GMS560/fuseq_wes/fuseq_wes_transcript_black_list.txt" gtf: "{{PROJECT_REF_DATA}}/ref_data/refGene/hg19.refGene.gtf" +filter_fuseq_wes_umi: + gene_fusion_black_list: "{{PROJECT_DESIGN_DATA}}/GMS560/fuseq_wes/false_positive_fusion_pairs.txt" + gene_white_list: "{{PROJECT_DESIGN_DATA}}/GMS560/fuseq_wes/fuseq_wes_gene_white_list.txt" + transcript_black_list: "{{PROJECT_DESIGN_DATA}}/GMS560/fuseq_wes/fuseq_wes_transcript_black_list.txt" + gtf: "{{PROJECT_REF_DATA}}/ref_data/refGene/hg19.refGene.gtf" + fuseq_wes: ref_json: "{{PROJECT_REF_DATA}}/ref_data/fuseq_wes/UCSC_hg19_wes_contigSize3000_bigLen130000_r100.json" gtfSqlite: "{{PROJECT_REF_DATA}}/ref_data/fuseq_wes/UCSC_hg19_wes_contigSize3000_bigLen130000_r100.sqlite" @@ -105,12 +112,11 @@ hotspot_annotation: chr_translation_file: "config/reports/hotspot_report.chr.translation.hg19" hotspots: "{{PROJECT_DESIGN_DATA}}/GMS560/reports/Hotspots_combined_regions_nodups.231011.csv" -hotspot_info: - hotspot_mutations: "{{PROJECT_DESIGN_DATA}}/GMS560/reports/Hotspots_combined_regions_nodups.231011.csv" - hotspot_report: chr_translation_file: "config/reports/hotspot_report.chr.translation.hg19" - hotspot_mutations: "{{PROJECT_DESIGN_DATA}}/GMS560/reports/Hotspots_combined_regions_nodups.231011.csv" + hotspot_mutations: + all: "{{PROJECT_DESIGN_DATA}}/GMS560/reports/Hotspots_combined_regions_nodups.231011.csv" + ENC: "{{PROJECT_DESIGN_DATA}}/GMS560/reports/ENC_hotspots_240604.csv" manta_config_t: extra: "--exome --callRegions {{PROJECT_DESIGN_DATA}}/GMS560/design/pool1_pool2.sort.merged.padded20.cnv200.hg19.split_fusion_genes.210608.bed.gz" diff --git a/config/config.data.hg38.yaml b/config/config.data.hg38.yaml index 3ffc1ef3..3d3546ea 100644 --- a/config/config.data.hg38.yaml +++ b/config/config.data.hg38.yaml @@ -54,13 +54,14 @@ bwa_mem: sa: "{{PROJECT_REF_DATA}}/ref_data/GRCh38_p14/homo_sapiens.fasta.sa" call_small_cnv_deletions: + blacklist: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/small_deletions_caller_blacklist_hg38.tsv" regions_file: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/cnv_deletion_genes_hg38.tsv" call_small_cnv_amplifications: regions_file: "{{PROJECT_DESIGN_DATA}}/GMS560/cnv/cnv_amplification_genes_hg38_240307.tsv" cnvkit_batch: - normal_reference: "{{PROJECT_PON_DATA}}/GMS560/PoN/cnvkit_nextseq_noUmea_27_hg38.cnn" + normal_reference: "{{PROJECT_PON_DATA}}/GMS560/PoN/cnvkit_PoN_combined_hg38.cnn" cnvkit_batch_hrd: normal_reference_hrd: "{{PROJECT_PON_DATA}}/GMS560/PoN/cnvkit_nextseq_noUmea_27_hg38.cnn" @@ -77,6 +78,12 @@ filter_fuseq_wes: transcript_black_list: "{{PROJECT_DESIGN_DATA}}/GMS560/fuseq_wes/fuseq_wes_transcript_black_list.txt" gtf: "{{PROJECT_REF_DATA}}/ref_data/refGene/hg38.refGene.gtf" +filter_fuseq_wes_umi: + gene_fusion_black_list: "{{PROJECT_DESIGN_DATA}}/GMS560/fuseq_wes/false_positive_fusion_pairs.txt" + gene_white_list: "{{PROJECT_DESIGN_DATA}}/GMS560/fuseq_wes/fuseq_wes_gene_white_list.txt" + transcript_black_list: "{{PROJECT_DESIGN_DATA}}/GMS560/fuseq_wes/fuseq_wes_transcript_black_list.txt" + gtf: "{{PROJECT_REF_DATA}}/ref_data/refGene/hg38.refGene.gtf" + fuseq_wes: ref_json: "{{PROJECT_REF_DATA}}/ref_data/fuseq_wes/UCSC_hg38_wes_contigSize3000_bigLen130000_r100.json" gtfSqlite: "{{PROJECT_REF_DATA}}/ref_data/fuseq_wes/UCSC_hg38_wes_contigSize3000_bigLen130000_r100.sqlite" @@ -97,16 +104,19 @@ gatk_get_pileup_summaries: sites: "{{PROJECT_REF_DATA}}/ref_data/GNOMAD/small_exac_common_3.hg38.vcf.gz" variants: "{{PROJECT_REF_DATA}}/ref_data/GNOMAD/small_exac_common_3.hg38.vcf.gz" +gene_fuse: + genes: "/projects/wp1/nobackup/ngs/utveckling/Twist_DNA_DATA/gene_fuse/GMS560_fusion_w_pool2.hg38.csv" + fasta: "/data/ref_genomes/GRCh38_p14/homo_sapiens.fasta" + hotspot_annotation: chr_translation_file: "config/reports/hotspot_report.chr.translation.hg38" hotspots: "{{PROJECT_DESIGN_DATA}}/GMS560/reports/Hotspots_combined_regions_nodups.231011_hg38.csv" -hotspot_info: - hotspot_mutations: "{{PROJECT_DESIGN_DATA}}/GMS560/reports/Hotspots_combined_regions_nodups.231011_hg38.csv" - hotspot_report: chr_translation_file: "config/reports/hotspot_report.chr.translation.hg38" - hotspot_mutations: "{{PROJECT_DESIGN_DATA}}/GMS560/reports/Hotspots_combined_regions_nodups.231011_hg38.csv" + hotspot_mutations: + all: "{{PROJECT_DESIGN_DATA}}/GMS560/reports/Hotspots_combined_regions_nodups.231011_hg38.csv" + ENC: "{{PROJECT_DESIGN_DATA}}/GMS560/reports/ENC_hotspots_240604_hg38.csv" manta_config_t: extra: "--exome --callRegions {{PROJECT_DESIGN_DATA}}/GMS560/design/pool1_pool2.sort.merged.padded20.cnv200.hg38.split_fusion_genes.reannotated.230222.bed.gz" diff --git a/config/config.yaml b/config/config.yaml index 1aedaa69..b6602b4e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -5,7 +5,7 @@ units: "units.tsv" #hydra_local_path: "PATH_TO_REPO" output: "config/output_files.yaml" -default_container: "docker://hydragenetics/common:1.10.2" +default_container: "docker://hydragenetics/common:3.0.0" trimmer_software: "fastp_pe" @@ -142,6 +142,10 @@ filter_fuseq_wes: min_support: 50 filter_on_fusiondb: True +filter_fuseq_wes_umi: + min_support: 15 + filter_on_fusiondb: True + fuseq_wes: container: "docker://hydragenetics/fuseq_wes:1.0.1" @@ -182,6 +186,9 @@ gatk_mutect2_gvcf: gatk_mutect2_merge_stats: container: "docker://hydragenetics/gatk4:4.1.9.0" +gene_fuse: + container: "docker://hydragenetics/genefuse:0.6.1" + hotspot_report: report_config: "config/reports/hotspot_report.yaml" levels: @@ -299,6 +306,9 @@ report_fusions: star_fusion_low_support: 2 star_fusion_low_support_inframe: 6 +report_gene_fuse: + min_unique_reads: 6 + samtools_merge_bam: extra: "-c -p" diff --git a/config/filters/config_hard_filter_umi_vep105.yaml b/config/filters/config_hard_filter_umi_vep105.yaml index e0b4fb64..a66e2b12 100644 --- a/config/filters/config_hard_filter_umi_vep105.yaml +++ b/config/filters/config_hard_filter_umi_vep105.yaml @@ -3,6 +3,10 @@ filters: description: "Hard filter intronic variants" expression: "(exist[intron_variant, VEP:Consequence] and !exist[splice, VEP:Consequence] and VEP:SYMBOL != MET and VEP:SYMBOL != TERT and !exist[COSV[0-9]+, VEP:Existing_variation])" soft_filter: "False" + noisy_gene: + description: "Hard filter variants in noisy genes" + expression: "VEP:SYMBOL = MUC6 or VEP:SYMBOL = CDC27" + soft_filter: "False" artifacts: description: "Hard filter variants found in more than 3 normal samples" expression: "((INFO:Artifact:0 > 2 and INFO:ArtifactNrSD:0 < 10) or (INFO:Artifact:0 > 2 and INFO:ArtifactNrSD:1 < 10))" diff --git a/config/filters/config_soft_filter_umi_vep105.yaml b/config/filters/config_soft_filter_umi_vep105.yaml index fadf0abe..dda54549 100644 --- a/config/filters/config_soft_filter_umi_vep105.yaml +++ b/config/filters/config_soft_filter_umi_vep105.yaml @@ -4,6 +4,11 @@ filters: expression: "(exist[intron_variant, VEP:Consequence] and !exist[splice, VEP:Consequence] and VEP:SYMBOL != MET and VEP:SYMBOL != TERT and !exist[COSV[0-9]+, VEP:Existing_variation])" soft_filter_flag: "intron" soft_filter: "True" + noisy_gene: + description: "Hard filter variants in noisy genes" + expression: "VEP:SYMBOL = MUC6 or VEP:SYMBOL = CDC27" + soft_filter_flag: "noisy_gene" + soft_filter: "True" artifacts: description: "Hard filter variants found in more than 3 normal samples" expression: "((INFO:Artifact:0 > 2 and INFO:ArtifactNrSD:0 < 10) or (INFO:Artifact:0 > 2 and INFO:ArtifactNrSD:1 < 10))" diff --git a/config/output_files.yaml b/config/output_files.yaml index 33494cea..2ed4a5f3 100644 --- a/config/output_files.yaml +++ b/config/output_files.yaml @@ -137,12 +137,18 @@ files: types: - T - N - - name: Hotspot TSV - input: qc/hotspot_report/{sample}_{type}.output.tsv + - name: Coverage and mutations all + input: qc/hotspot_report/{sample}_{type}.all.output.tsv output: results/dna/{sample}_{type}/qc/{sample}_{type}.coverage_and_mutations.tsv types: - T - N + - name: Coverage and mutations ENC + input: qc/hotspot_report/{sample}_{type}.ENC.output.tsv + output: results/dna/{sample}_{type}/qc/{sample}_{type}.coverage_and_mutations.ENC.tsv + types: + - T + - N - name: Contamination table input: qc/gatk_calculate_contamination/{sample}_{type}.contamination.table output: results/dna/{sample}_{type}/additional_files/qc/{sample}_{type}.contamination.table @@ -247,6 +253,36 @@ files: types: - T - N + - name: fuseq WES umi + input: fusions/filter_fuseq_wes/{sample}_{type}.fuseq_wes.report.umi.csv + output: results/dna/{sample}_{type}/fusion/{sample}_{type}.fuseq_wes.report.umi.csv + types: + - T + - N + deduplication: + - umi + - name: fuseq WES unfiltered + input: fusions/fuseq_wes/{sample}_{type}/FuSeq_WES_FusionFinal.txt + output: results/dna/{sample}_{type}/additional_files/fusion/{sample}_{type}.fuseq_wes.unfiltered.results.csv + types: + - T + - N + - name: GeneFuse report + input: fusions/report_gene_fuse/{sample}_{type}.gene_fuse_report.tsv + output: results/dna/{sample}_{type}/fusion/{sample}_{type}.gene_fuse_report.umi.tsv + types: + - T + - N + deduplication: + - umi + - name: GeneFuse fusions unfiltered + input: fusions/gene_fuse/{sample}_{type}_gene_fuse_fusions.txt + output: results/dna/{sample}_{type}/additional_files/fusion/{sample}_{type}.gene_fuse.unfiltered.results.txt + types: + - T + - N + deduplication: + - umi - name: ID-SNP VCF RNA input: snv_indels/bcftools_id_snps/{sample}_{type}.id_snps.vcf output: results/rna/{sample}_{type}/id_snps/{sample}_{type}.id_snps.vcf diff --git a/config/references/design_files.hg19.yaml b/config/references/design_files.hg19.yaml index fb6da1d7..dbf6f698 100644 --- a/config/references/design_files.hg19.yaml +++ b/config/references/design_files.hg19.yaml @@ -48,6 +48,11 @@ url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.6.0/cnv/cnv_amplification_genes_240307.tsv call_small_cnv_deletions: + blacklist: + checksum: 019763e1b79682a56074a3f53a73d149 + path: GMS560/cnv/small_deletions_caller_blacklist.tsv + type: file + url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.7.0/cnv/small_deletions_caller_blacklist.tsv regions_file: checksum: e044cf88e5e50e68a3a797be5720e6c0 path: GMS560/cnv/cnv_deletion_genes.tsv @@ -122,19 +127,18 @@ type: file url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.4.0/design/Hotspots_combined_regions_nodups.231011.csv - hotspot_info: - hotspot_mutations: - checksum: 50564a9b99f86b123bb61f6e361eff10 - path: GMS560/reports/Hotspots_combined_regions_nodups.231011.csv - type: file - url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.4.0/design/Hotspots_combined_regions_nodups.231011.csv - hotspot_report: hotspot_mutations: checksum: 50564a9b99f86b123bb61f6e361eff10 path: GMS560/reports/Hotspots_combined_regions_nodups.231011.csv type: file url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.4.0/design/Hotspots_combined_regions_nodups.231011.csv + + hotspot_mutations_ENC: + checksum: 2c7c1707880dbf1790e8a472c1d1e311 + path: GMS560/reports/ENC_hotspots_240604.csv + type: file + url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.8.1/design/ENC_hotspots_240604.csv manta_config_t: extra: diff --git a/config/references/design_files.hg38.yaml b/config/references/design_files.hg38.yaml index 475d0b60..dc2bc0e1 100644 --- a/config/references/design_files.hg38.yaml +++ b/config/references/design_files.hg38.yaml @@ -48,6 +48,11 @@ url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.6.0/cnv/cnv_amplification_genes_hg38_240307.tsv call_small_cnv_deletions: + blacklist: + checksum: 9f57eec94bee9b4edf4bba409c7b7b4c + path: GMS560/cnv/small_deletions_caller_blacklist_hg38.tsv + type: file + url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.7.0/cnv/small_deletions_caller_blacklist_hg38.tsv regions_file: checksum: 92510485ce32e9ef64697121aae4122c path: GMS560/cnv/cnv_deletion_genes_hg38.tsv @@ -115,19 +120,18 @@ type: file url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.5.0/design/Hotspots_combined_regions_nodups.231011_hg38.csv - hotspot_info: + hotspot_report: hotspot_mutations: checksum: 08c574ec13984188c8e4d2c80c14d2e3 path: GMS560/reports/Hotspots_combined_regions_nodups.231011_hg38.csv type: file url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.5.0/design/Hotspots_combined_regions_nodups.231011_hg38.csv - hotspot_report: - hotspot_mutations: - checksum: 08c574ec13984188c8e4d2c80c14d2e3 - path: GMS560/reports/Hotspots_combined_regions_nodups.231011_hg38.csv + hotspot_mutations_ENC: + checksum: db3d27fa28240e00977db554c131e2e4 + path: GMS560/reports/ENC_hotspots_240604_hg38.csv type: file - url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.5.0/design/Hotspots_combined_regions_nodups.231011_hg38.csv + url: https://github.com/genomic-medicine-sweden/Twist_Solid_pipeline_files/raw/v0.8.1/design/ENC_hotspots_240604_hg38.csv manta_config_t: extra: diff --git a/config/references/nextseq.hg19.pon.yaml b/config/references/nextseq.hg19.pon.yaml index ca1a0d97..cee6cdf1 100644 --- a/config/references/nextseq.hg19.pon.yaml +++ b/config/references/nextseq.hg19.pon.yaml @@ -30,10 +30,10 @@ cnvkit_batch: normal_reference: - checksum: 2c8f519a872d92ec1cb43bd29518a71b - path: GMS560/PoN/cnvkit_nextseq_36.cnn + checksum: 203d8e695b6324f8a5c27e2c7882f3d2 + path: GMS560/PoN/cnvkit_PoN_combined_hg19.cnn type: file - url: https://figshare.scilifelab.se/ndownloader/files/40882850 + url: https://figshare.com/ndownloader/files/46640134?private_link=88202740beb6fcbac09d cnvkit_batch_hrd: normal_reference_hrd: diff --git a/config/references/nextseq.hg38.pon.yaml b/config/references/nextseq.hg38.pon.yaml index 7dbad80a..1ed6dce0 100644 --- a/config/references/nextseq.hg38.pon.yaml +++ b/config/references/nextseq.hg38.pon.yaml @@ -30,10 +30,10 @@ cnvkit_batch: normal_reference: - checksum: b9995ea0cfa1c259fe7229aee1fd2646 - path: GMS560/PoN/cnvkit_nextseq_noUmea_27_hg38.cnn + checksum: 0dd2ecd2e4bd4bd364404740b0f7a16c + path: GMS560/PoN/cnvkit_PoN_combined_hg38.cnn type: file - url: https://figshare.com/ndownloader/files/44328734?private_link=53f1597211477d840a29 + url: https://figshare.com/ndownloader/files/46640131?private_link=88202740beb6fcbac09d #cnvkit_batch_hrd: # normal_reference_hrd: diff --git a/config/references/novaseq.hg19.pon.yaml b/config/references/novaseq.hg19.pon.yaml index 45c5b060..160246a7 100644 --- a/config/references/novaseq.hg19.pon.yaml +++ b/config/references/novaseq.hg19.pon.yaml @@ -29,17 +29,17 @@ cnvkit_batch: normal_reference: - checksum: e7853030e7a6b0ccbeffb7c008983a29 - path: GMS560/PoN/cnvkit_novaseq_13.cnn + checksum: 203d8e695b6324f8a5c27e2c7882f3d2 + path: GMS560/PoN/cnvkit_PoN_combined_hg19.cnn type: file - url: https://figshare.scilifelab.se/ndownloader/files/40882853 + url: https://figshare.com/ndownloader/files/46640134?private_link=88202740beb6fcbac09d cnvkit_batch_hrd: normal_reference_hrd: checksum: e7853030e7a6b0ccbeffb7c008983a29 path: GMS560/PoN/cnvkit_novaseq_13_HRD.cnn type: file - url: https://figshare.scilifelab.se/ndownloader/files/40882871 + url: https://figshare.scilifelab.se/ndownloader/files/40914854 msisensor_pro: PoN: diff --git a/config/reports/multiqc_config_dna.yaml b/config/reports/multiqc_config_dna.yaml index 67e3ffa5..549f29f0 100644 --- a/config/reports/multiqc_config_dna.yaml +++ b/config/reports/multiqc_config_dna.yaml @@ -76,7 +76,6 @@ multiqc_cgs: format: "{:.1%}" PCT_USABLE_BASES_ON_TARGET: title: "Usable bases [%]" - format: "{:.0%}" description: "Bases aligned, on exon target, and dedup [%] from Picard" min: 0 max: 0.2 @@ -149,8 +148,6 @@ multiqc_cgs: table_columns_placement: Picard: PF_READS_ALIGNED: 900 - PCT_PF_READS_ALIGNED: 901 - PERCENT_DUPLICATION: 902 PCT_PF_READS_ALIGNED: 910 PERCENT_DUPLICATION: 920 PCT_SELECTED_BASES: 930 diff --git a/config/resources.yaml b/config/resources.yaml index 35a36288..29fc8259 100644 --- a/config/resources.yaml +++ b/config/resources.yaml @@ -32,6 +32,7 @@ fastp_pe_arriba: threads: 5 mem_mb: 30720 mem_per_cpu: 6144 + fgbio_call_and_filter_consensus_reads: threads: 3 mem_mb: 18432 @@ -52,12 +53,6 @@ fusioncatcher: mem_mb: 61440 mem_per_cpu: 6144 -vep: - threads: 5 - time: "6:00:00" - mem_mb: 30720 - mem_per_cpu: 6144 - fastqc: threads: 2 mem_mb: 12288 @@ -81,9 +76,6 @@ gene_fuse: mem_mb: 36864 mem_per_cpu: 6144 -vardict: - time: "48:00:00" - star: threads: 5 time: "8:00:00" @@ -95,3 +87,12 @@ star_fusion: time: "16:00:00" mem_mb: 30720 mem_per_cpu: 6144 + +vardict: + time: "48:00:00" + +vep: + threads: 5 + time: "6:00:00" + mem_mb: 30720 + mem_per_cpu: 6144 \ No newline at end of file diff --git a/docs/dna_fusions.md b/docs/dna_fusions.md index 82e814b3..2dac4bec 100644 --- a/docs/dna_fusions.md +++ b/docs/dna_fusions.md @@ -6,8 +6,53 @@ See the [fusions hydra-genetics module](https://hydra-genetics-fusions.readthedo ## Pipeline output files: +* `results/dna/fusion/{sample}_{type}.gene_fuse_report.tsv` (with UMI option only) * `results/dna/fusion/{sample}_{type}.fuseq_wes.report.csv` +## Fusions calling using GeneFuse +DNA fusion calling is performed by **[GeneFuse](https://github.com/OpenGene/GeneFuse)** v0.6.1 on fastq-files. It uses a gene transcript target file to limit the number of targets to analyze. + +### Configuration + +**References** + +* [Fasta reference](references.md#genefuse_fasta) genome +* [Gene transcript](references.md#genefuse_transcripts) file with genomic positions for all exons include in the analysis + +
+**Cluster resources** + +| **Options** | **Value** | +|-------------|-| +| mem_mb | 36864 | +| mem_per_cpu | 6144 | +| threads | 6 | +| time | "8:00:00" | + +## GeneFuse Filtering and report +The output from GeneFuse is filtered and then reported into a fusion report using the in-house script [report_gene_fuse.py](https://github.com/genomic-medicine-sweden/Twist_Solid/blob/develop/workflow/scripts/report_gene_fuse.py) ([rule and config](rules.md#report_gene_fuse)). The following filter criteria is used: + +* Fusions must have at least 6 unique supporting reads. +* Very noisy fusion pairs found in almost all samples (defined in [`filter_fusions_20230214.csv`](references.md#genefuse_filter_fusions)) are removed: + - NPM1::ALK + - CLTC::NTRK3 + - MSH2_ALK + - MSH2_HIP1 +* Noisy fusion pairs found in some samples (defined in [`filter_fusions_20230214.csv`](references.md#genefuse_filter_fusions)) are filtered individually on the number of uniquely supporting reads: + - LMNA::EZR 9 + - ABL1::STRN 7 + - EZR::ALK 8 + - RSPO2::BRAF 8 + - LMNA::HIP1 12 + - NPM1::BICC1 11 + - RSPO2::ERG 13 + +### Result file + +* `results/dna/fusion/{sample}_{type}.gene_fuse_report.tsv` + +
+ ## Fusions calling using FuSeq_WES DNA fusion calling is performed by **[FuSeq_WES](https://github.com/nghiavtr/FuSeq_WES)** v1.0.1 on bam-files. It uses a gene transcript target file to limit the number of targets to analyze. diff --git a/docs/run_on_restricted_system_env.md b/docs/run_on_restricted_system_env.md index 99216063..31d4b50d 100644 --- a/docs/run_on_restricted_system_env.md +++ b/docs/run_on_restricted_system_env.md @@ -15,11 +15,8 @@ Fetch the pipeline and install requirements TAG_OR_BRANCH="vX.Y.X" # Clone selected version -git clone --branch ${VERSION} https://github.com/genomic-medicine-sweden/Twist_Solid.git +git clone --branch ${TAG_OR_BRANCH} https://github.com/genomic-medicine-sweden/Twist_Solid.git cd Twist_Solid -# Python 3.8 or newer -python3 -m venv venv && source venv/bin/activate -pip install -r requirements.txt ``` ## Environment @@ -50,12 +47,12 @@ hydra-genetics prepare-environment create-singularity-files -c config/config.yam ```bash # NextSeq - hydra-genetics --debug references download -o design_and_ref_files -v config/references/design_files.hg19.yaml -v config/references/nextseq.hg19.pon.yaml -v config/references/references.hg19.yaml +hydra-genetics --debug references download -o design_and_ref_files -v config/references/design_files.hg19.yaml -v config/references/nextseq.hg19.pon.yaml -v config/references/references.hg19.yaml - #NovaSeq, not all files are prepare for novaseq - hydra-genetics references download -o design_and_ref_files -v config/references/design_files.hg19.yaml -v config/references/novaseq.hg19.pon.yaml -v config/references/references.hg19.yaml +#NovaSeq, not all files are prepare for novaseq +hydra-genetics references download -o design_and_ref_files -v config/references/design_files.hg19.yaml -v config/references/novaseq.hg19.pon.yaml -v config/references/references.hg19.yaml - # Compress data +# Compress data tar -czvf design_and_ref_files.tar.gz design_and_ref_files ``` @@ -66,7 +63,7 @@ tar -czvf design_and_ref_files.tar.gz design_and_ref_files The following file/folders have been created and need to be moved to your server: 1. file: design_and_ref_files.tar.gz -2. file Twist_Solid_{TAG_OR_BRANCH}.tar.gz +2. file: Twist_Solid_{TAG_OR_BRANCH}.tar.gz 3. folder: singularity_cache --- @@ -81,7 +78,7 @@ The following file/folders have been created and need to be moved to your server # Extract tar. TAG_OR_BRANCH="vX.Y.X" tar -xvf Twist_Solid_${TAG_OR_BRANCH}.tar.gz -cd Twist_Solid_{TAG_OR_BRANCH} +cd Twist_Solid_${TAG_OR_BRANCH} mkdir venv && tar xvf env.tar.gz -C venv/ source venv/bin/activate @@ -142,7 +139,7 @@ Copy a profile and modify it to match your system, ex```Twist_Solid_${TAG_OR_BRA ```yaml # Found at Twist_Solid_{TAG_OR_BRANCH}/snakemake-wrappers, use absolute_path with 'git+file:/' wrapper-prefix="PATH_TO_WRAPPERS" -# ex: wrapper-prefix="git+file://proj/sens2022566/nobackup/patriksm/Twist_Solid_add-{TAG_OR_BRANCH}/snakemake-wrappers/" +# ex: wrapper-prefix: "git+file://proj/sens2022566/nobackup/patriksm/Twist_Solid_add-{TAG_OR_BRANCH}/snakemake-wrappers/" # Update account info, change ADD_YOUR_ACCOUNT to your bianca project id drmaa: " -A ADD_YOUR_ACCOUNT -N 1-1 -t {resources.time} -n {resources.threads} --mem={resources.mem_mb} --mem-per-cpu={resources.mem_per_cpu} --mem-per-cpu={resources.mem_per_cpu} --partition={resources.partition} -J {rule} -e slurm_out/{rule}_%j.err -o slurm_out/{rule}_%j.out" diff --git a/docs/softwares.md b/docs/softwares.md index 5ba17768..e587ca7b 100644 --- a/docs/softwares.md +++ b/docs/softwares.md @@ -208,6 +208,29 @@ Make a combined report of filtered fusion calls from all RNA fusion callers. App --- +## report_gene_fuse +Make a report of filtered fusion calls from the DNA fusion caller Gene Fuse. Applies configurable thresholds for supporting reads and flags noisy genes as well as filters artifact genes. See further [DNA fusions report info](dna_fusions.md). + +### :snake: Rule + +#SNAKEMAKE_RULE_SOURCE__report_gene_fuse__report_gene_fuse# + +#### :left_right_arrow: input / output files + +#SNAKEMAKE_RULE_TABLE__report_gene_fuse__report_gene_fuse# + +### :wrench: Configuration + +#### Software settings (`config.yaml`) + +#CONFIGSCHEMA__report_gene_fuse# + +#### Resources settings (`resources.yaml`) + +#RESOURCESSCHEMA__report_gene_fuse# + +--- + ## purecn_modify_vcf Increases the MQB (mean base quality) value by 5 as the qualities are so bad for our samples. diff --git a/requirements.txt b/requirements.txt index 14c24826..93a9de17 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -hydra-genetics==1.14.0 +hydra-genetics==3.0.0 pandas>=1.3.1 snakemake==7.18.0 singularity==3.0.0 diff --git a/workflow/Snakefile b/workflow/Snakefile index 1ff9f7bc..188982f2 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -15,6 +15,7 @@ include: "rules/fix_vcf_ad_for_qci.smk" include: "rules/hotspot_report.smk" include: "rules/house_keeping_gene_coverage.smk" include: "rules/purecn_modify_vcf.smk" +include: "rules/report_gene_fuse.smk" include: "rules/report_fusions.smk" @@ -480,17 +481,21 @@ use rule * from fusions exclude all as fusions_* use rule fuseq_wes from fusions as fusions_fuseq_wes with: - input: - bam=lambda wildcards: get_deduplication_bam_input(wildcards), - bai=lambda wildcards: "%s%s" % (get_deduplication_bam_input(wildcards), ".bai"), - ref_json=config.get("fuseq_wes", {}).get("ref_json", ""), - gtfSqlite=config.get("fuseq_wes", {}).get("gtfSqlite", ""), - fusiondb=config.get("fuseq_wes", {}).get("fusiondb", ""), - paralogdb=config.get("fuseq_wes", {}).get("paralogdb", ""), - params=config.get("fuseq_wes", {}).get("params", ""), priority: 50 +use rule filter_report_fuseq_wes from fusions as fusions_filter_report_fuseq_wes_umi with: + output: + fusions=temp("fusions/filter_fuseq_wes/{sample}_{type}.fuseq_wes.report.umi.csv"), + params: + min_support=config.get("filter_fuseq_wes_umi", {}).get("min_support", ""), + gene_white_list=config.get("filter_fuseq_wes_umi", {}).get("gene_white_list", ""), + gene_fusion_black_list=config.get("filter_fuseq_wes_umi", {}).get("gene_fusion_black_list", ""), + transcript_black_list=config.get("filter_fuseq_wes_umi", {}).get("transcript_black_list", ""), + filter_on_fusiondb=config.get("filter_fuseq_wes_umi", {}).get("filter_on_fusiondb", ""), + gtf=config.get("filter_fuseq_wes_umi", {}).get("gtf", ""), + + use rule star_fusion from fusions as fusions_star_fusion with: output: bam=temp("fusions/star_fusion/{sample}_{type}/Aligned.out.bam"), diff --git a/workflow/rules/call_small_cnv_deletions.smk b/workflow/rules/call_small_cnv_deletions.smk index 0e6084ae..cc2d7885 100644 --- a/workflow/rules/call_small_cnv_deletions.smk +++ b/workflow/rules/call_small_cnv_deletions.smk @@ -11,10 +11,11 @@ rule call_small_cnv_deletions: output: deletions=temp("cnv_sv/call_small_cnv_deletions/{sample}_{type}.deletions.tsv"), params: - window_size=config.get("call_small_cnv_deletions", {}).get("window_size", 4), - region_max_size=config.get("call_small_cnv_deletions", {}).get("region_max_size", 30), + blacklist=config.get("call_small_cnv_deletions", {}).get("blacklist", ""), min_nr_stdev_diff=config.get("call_small_cnv_deletions", {}).get("min_nr_stdev_diff", 2.5), min_log_odds_diff=config.get("call_small_cnv_deletions", {}).get("min_log_odds_diff", 0.3), + region_max_size=config.get("call_small_cnv_deletions", {}).get("region_max_size", 30), + window_size=config.get("call_small_cnv_deletions", {}).get("window_size", 4), log: "cnv_sv/call_small_cnv_deletions/{sample}_{type}.deletions.tsv.log", benchmark: diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 9b72b638..e8e7a1b3 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -24,10 +24,10 @@ from hydra_genetics.utils.misc import export_config_as_file from hydra_genetics.utils.software_versions import add_version_files_to_multiqc from hydra_genetics.utils.software_versions import add_software_version_to_config from hydra_genetics.utils.software_versions import export_pipeline_version_as_file -from hydra_genetics.utils.software_versions import export_software_version_as_files +from hydra_genetics.utils.software_versions import export_software_version_as_file from hydra_genetics.utils.software_versions import get_pipeline_version -from hydra_genetics.utils.software_versions import touch_pipeline_verion_file_name -from hydra_genetics.utils.software_versions import touch_software_version_files +from hydra_genetics.utils.software_versions import touch_pipeline_version_file_name +from hydra_genetics.utils.software_versions import touch_software_version_file from hydra_genetics.utils.software_versions import use_container @@ -76,9 +76,9 @@ validate(output_spec, schema="../schemas/output_files.schema.yaml") date_string = datetime.now().strftime("%Y%m%d") pipeline_version = get_pipeline_version(workflow, pipeline_name="Twist_Solid") -version_files = touch_pipeline_verion_file_name(pipeline_version, date_string=date_string, directory="results/versions/software") +version_files = touch_pipeline_version_file_name(pipeline_version, date_string=date_string, directory="results/versions/software") if use_container(workflow): - version_files += touch_software_version_files(config, date_string=date_string, directory="results/versions/software") + version_files.append(touch_software_version_file(config, date_string=date_string, directory="results/versions/software")) add_version_files_to_multiqc(config, version_files) @@ -86,7 +86,7 @@ onstart: export_pipeline_version_as_file(pipeline_version, date_string=date_string, directory="results/versions/software") if use_container(workflow): update_config, software_info = add_software_version_to_config(config, workflow, False) - export_software_version_as_files(software_info, date_string=date_string, directory="results/versions/software") + export_software_version_as_file(software_info, date_string=date_string, directory="results/versions/software") export_config_as_file(update_config, date_string=date_string, directory="results/versions") diff --git a/workflow/rules/hotspot_report.smk b/workflow/rules/hotspot_report.smk index 26d7438c..5f9bf499 100644 --- a/workflow/rules/hotspot_report.smk +++ b/workflow/rules/hotspot_report.smk @@ -9,7 +9,7 @@ __license__ = "GPL-3" rule hotspot_report: input: - hotspots=config["hotspot_report"]["hotspot_mutations"], + hotspots=lambda wildcards: config["hotspot_report"]["hotspot_mutations"][wildcards.tag], vcf="snv_indels/bcbio_variation_recall_ensemble/{sample}_{type}.ensembled.vep_annotated.artifact_annotated.hotspot_annotated.background_annotated.include.exon.filter.snv_hard_filter.codon_snvs.sorted.vep_annotated.vcf.gz", vcf_index="snv_indels/bcbio_variation_recall_ensemble/{sample}_{type}.ensembled.vep_annotated.artifact_annotated.hotspot_annotated.background_annotated.include.exon.filter.snv_hard_filter.codon_snvs.sorted.vep_annotated.vcf.gz.tbi", vcf_file_wo_pick="snv_indels/bcbio_variation_recall_ensemble/{sample}_{type}.ensembled.vep_annotated.artifact_annotated.hotspot_annotated.background_annotated.include.exon.filter.snv_hard_filter.codon_snvs.sorted.vep_annotated_wo_pick.vcf.gz", @@ -17,7 +17,7 @@ rule hotspot_report: gvcf="qc/add_mosdepth_coverage_to_gvcf/{sample}_{type}.mosdepth.g.vcf.gz", gvcf_index="qc/add_mosdepth_coverage_to_gvcf/{sample}_{type}.mosdepth.g.vcf.gz.tbi", output: - report=temp("qc/hotspot_report/{sample}_{type}.output.tsv"), + report=temp("qc/hotspot_report/{sample}_{type}.{tag}.output.tsv"), params: levels=config.get("hotspot_report", {}).get("levels", []), sample_name=lambda wildcards: "%s_%s" % (wildcards.sample, wildcards.type), @@ -25,10 +25,11 @@ rule hotspot_report: chr_translation_file=config.get("hotspot_report", {})["chr_translation_file"], extra=config.get("hotspot_report", {}).get("extra", ""), log: - "qc/hotspot_report/{sample}_{type}.output.tsv.log", + "qc/hotspot_report/{sample}_{type}.{tag}.output.tsv.log", benchmark: repeat( - "qc/hotspot_report/{sample}_{type}.output.benchmark.tsv", config.get("hotspot_report", {}).get("benchmark_repeats", 1) + "qc/hotspot_report/{sample}_{type}.{tag}.output.benchmark.tsv", + config.get("hotspot_report", {}).get("benchmark_repeats", 1), ) threads: config.get("hotspot_report", {}).get("threads", config["default_resources"]["threads"]) resources: @@ -40,6 +41,6 @@ rule hotspot_report: container: config.get("hotspot_report", {}).get("container", config["default_container"]) message: - "{rule}: Do stuff on twist_dna_solid_uppsala/{rule}/{wildcards.sample}_{wildcards.type}.input" + "{rule}: Make a coverage and mutations report: {output.report}" script: "../scripts/hotspot_report.py" diff --git a/workflow/rules/report_gene_fuse.smk b/workflow/rules/report_gene_fuse.smk new file mode 100644 index 00000000..b3078eaf --- /dev/null +++ b/workflow/rules/report_gene_fuse.smk @@ -0,0 +1,34 @@ +__author__ = "Jonas Almlöf" +__copyright__ = "Copyright 2022, Jonas Almlöf" +__email__ = "jonas.almlof@scilifelab.uu.se" +__license__ = "GPL3" + + +rule report_gene_fuse: + input: + fusions="fusions/gene_fuse/{sample}_{type}_gene_fuse_fusions.txt", + output: + report=temp("fusions/report_gene_fuse/{sample}_{type}.gene_fuse_report.tsv"), + params: + filter_fusions=config.get("report_gene_fuse", {}).get("filter_fusions", ""), + min_unique_reads=config.get("report_gene_fuse", {}).get("min_unique_reads", 6), + log: + "fusions/report_gene_fuse/{sample}_{type}.gene_fuse_report.tsv.log", + threads: config.get("report_gene_fuse", {}).get("threads", config["default_resources"]["threads"]) + resources: + threads=config.get("report_gene_fuse", {}).get("threads", config["default_resources"]["threads"]), + time=config.get("report_gene_fuse", {}).get("time", config["default_resources"]["time"]), + mem_mb=config.get("report_gene_fuse", {}).get("mem_mb", config["default_resources"]["mem_mb"]), + mem_per_cpu=config.get("report_gene_fuse", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), + partition=config.get("report_gene_fuse", {}).get("partition", config["default_resources"]["partition"]), + benchmark: + repeat( + "fusions/report_gene_fuse/{sample}_{type}.gene_fuse_report.tsv.benchmark.tsv", + config.get("report_gene_fuse", {}).get("benchmark_repeats", 1), + ) + container: + config.get("report_gene_fuse", {}).get("container", config["default_container"]) + message: + "{rule}: Collect and filter gene fuse dna fusions and create report: {output.report}" + script: + "../scripts/report_gene_fuse.py" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 298b94f2..cac451c1 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -82,18 +82,21 @@ properties: Defines both the genomic position of the genes and their surrounding regions that should be analysed for this gene. The size of the analysis region is dependent on the density of the backbone but should at least 20 backbone SNPs. 40 SNPs is used in Pipeline. - window_size: - type: integer - description: Number of data points included in the sliding window - region_max_size: - type: integer - description: Max number of data points of deletion to be reported + blacklist: + type: string + description: File with gene regions that should be filtered (Format is header + gene\tchr\tpos1\tpos2) min_nr_stdev_diff: type: number description: Min number of standard deviations difference between the deletion and surrounding data points in region min_log_odds_diff: type: number description: Min log2ratio difference between the deletion and the average of surrounding data points in region + region_max_size: + type: integer + description: Max number of data points of deletion to be reported + window_size: + type: integer + description: Number of data points included in the sliding window cnv_tsv_report: type: object @@ -336,14 +339,21 @@ properties: type: object properties: hotspot_mutations: - type: string - description: Text file with hotspot positions and annotations + type: object + description: List of files with hotspot positions and annotations. The names of the list items are found in the file names as {tag}. + properties: + all: + type: string + description: Hotspot list for normal FFPE analysis + ENC: + type: string + description: Hotspot list for ENC FFPE analysis report_config: type: string description: file defining which fields should be extracted from the annotation chr_translation_file: type: string - description: file used to translate chr to NC id. The format is a tab seperate file with column chr and NC + description: file used to translate chr to NC id. The format is a tab separate file with column chr and NC levels: type: array description: | @@ -582,6 +592,19 @@ properties: type: integer description: lower limit of supporting reads to flag filter fusion that are inframe for StarFusion + report_gene_fuse: + description: report results from genefuse + type: object + properties: + filter_fusions: + type: string + description: file specifying fusions that should be filtered completely (value 0 in column 2) or have higher limit (value >0 in column 2) + min_unique_reads: + type: integer + description: lower limit of uniquely supporting reads to report fusion + required: + - min_unique_reads + trimmer_software: description: trimmer software that should be used pattern: "^(fastp_pe|None)$" @@ -622,7 +645,6 @@ required: - gatk_collect_allelic_counts - gatk_denoise_read_counts - hotspot_annotation - - hotspot_info - hotspot_report - msisensor_pro - multiqc diff --git a/workflow/schemas/resources.schema.yaml b/workflow/schemas/resources.schema.yaml index e7d1646e..219c8290 100644 --- a/workflow/schemas/resources.schema.yaml +++ b/workflow/schemas/resources.schema.yaml @@ -228,6 +228,26 @@ properties: type: integer description: number of threads to be available + report_gene_fuse: + type: object + description: resource definitions for generating a gene fuse report + properties: + mem_mb: + type: integer + description: memory in MB used per cpu + mem_per_cpu: + type: integer + description: memory used per cpu + partition: + type: string + description: partition to use on cluster + time: + type: string + description: max execution time + threads: + type: integer + description: number of threads to be available + sample_mixup_check: type: object description: resource definitions for sample_mixup_check diff --git a/workflow/schemas/rules.schema.yaml b/workflow/schemas/rules.schema.yaml index b6b4b0df..541c24ca 100644 --- a/workflow/schemas/rules.schema.yaml +++ b/workflow/schemas/rules.schema.yaml @@ -160,9 +160,6 @@ properties: type: object description: list of inputs properties: - hotspots: - type: string - description: Text file with hotspot positions and annotations vcf: type: string description: Vcf file with filtered SNV and INDEL calls @@ -181,7 +178,7 @@ properties: properties: report: type: string - description: Excel frienly text report with hotspot position qc information + description: Excel friendly text report with hotspot position qc information house_keeping_gene_coverage: type: object @@ -243,7 +240,26 @@ properties: properties: fusions: type: string - description: Excel frienly text report with filtered fusion calls from respective caller + description: Excel friendly text report with filtered fusion calls from respective caller + + report_gene_fuse: + type: object + description: input and output parameters for report_gene_fuse + properties: + input: + type: object + description: list of inputs + properties: + fusions: + type: string + description: Called fusions by GeneFuse + output: + type: object + description: list of outputs + properties: + report: + type: string + description: Excel friendly text report with filtered fusion calls from GeneFuse purecn_modify_vcf: type: object diff --git a/workflow/schemas/singularity.schema.yaml b/workflow/schemas/singularity.schema.yaml index d9dddbbd..64893cee 100644 --- a/workflow/schemas/singularity.schema.yaml +++ b/workflow/schemas/singularity.schema.yaml @@ -6,7 +6,7 @@ properties: type: string description: name or path to a default docker/singularity container pattern: >- - hydragenetics/common:1\.10\.2$ + hydragenetics/common:3\.0\.0$|hydragenetics_common_3\.0\.0\.sif$| bcbio_variation_recall_ensemble: type: object diff --git a/workflow/scripts/call_small_cnv_deletions.py b/workflow/scripts/call_small_cnv_deletions.py index f28f3b1e..d9432247 100644 --- a/workflow/scripts/call_small_cnv_deletions.py +++ b/workflow/scripts/call_small_cnv_deletions.py @@ -62,8 +62,12 @@ def find_max_probe_diff(probe_data, window_size): def filter_deletions( max_probe_diff_index, min_probe_diff_index, probe_data, gene_probe_index, region, deletions, region_max_size, window_size, - min_log_odds_diff, min_nr_stdev_diff + min_log_odds_diff, min_nr_stdev_diff, blacklist_dict, ): + # Blacklist filter + key = f"{region[0]}_{region[3]}_{region[4]}" + if key in blacklist_dict: + return "Blacklist" # Filter amplifications if max_probe_diff_index >= min_probe_diff_index: return "Amplification" @@ -136,10 +140,23 @@ def read_regions_data(regions_file): return regions +def read_blacklist(blacklist_file_name): + blacklist = open(blacklist_file_name) + blacklist_dict = {} + next(blacklist) + for line in blacklist: + columns = line.strip().split("\t") + key = f"{columns[1]}_{columns[2]}_{columns[3]}" + blacklist_dict[key] = "" + return blacklist_dict + + def call_small_cnv_deletions( - cnv_file_name, regions_file, deletions, window_size, region_max_size, min_nr_stdev_diff, min_log_odds_diff, + cnv_file_name, regions_file, deletions, window_size, + region_max_size, min_nr_stdev_diff, min_log_odds_diff, blacklist_file_name, ): regions = read_regions_data(regions_file) + blacklist_dict = read_blacklist(blacklist_file_name) sample_name = cnv_file_name.split("/")[1].split("_")[0] deletions.write("Gene(s)\tChromosome\tGene_start\tGene_end\tLog2_ratio_diff\tMedian_L2R_deletion\t") deletions.write("Median_L2R_surrounding\tNumber_of_data_points\tNumber_of_stdev\n") @@ -152,7 +169,7 @@ def call_small_cnv_deletions( max_probe_diff_index, min_probe_diff_index = find_max_probe_diff(probe_data, window_size) filter = filter_deletions( max_probe_diff_index, min_probe_diff_index, probe_data, gene_probe_index, region, deletions, region_max_size, - window_size, min_log_odds_diff, min_nr_stdev_diff, + window_size, min_log_odds_diff, min_nr_stdev_diff, blacklist_dict ) return filter @@ -168,4 +185,5 @@ def call_small_cnv_deletions( snakemake.params.region_max_size, snakemake.params.min_nr_stdev_diff, snakemake.params.min_log_odds_diff, + snakemake.params.blacklist, ) diff --git a/workflow/scripts/call_small_cnv_deletions_test.py b/workflow/scripts/call_small_cnv_deletions_test.py index 8a21b7ab..60188b9b 100644 --- a/workflow/scripts/call_small_cnv_deletions_test.py +++ b/workflow/scripts/call_small_cnv_deletions_test.py @@ -39,12 +39,13 @@ def test_call_small_cnv_deletions_ok(self): # True call cnv_data1 = ".tests/units/gatk_cnv/HD832.HES45_T.test1.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data1, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) out_deletions.close() @@ -64,13 +65,14 @@ def test_call_small_cnv_deletions_ok(self): def test_call_small_cnv_deletions_filter(self): from call_small_cnv_deletions import call_small_cnv_deletions - # Filter 1 (Amplifactions) + # Filter 1 (Amplifications) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter1.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Amplification", filter) out_deletions.close() @@ -78,11 +80,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 2 (Large deletions) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter2.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Too_large", filter) out_deletions.close() @@ -90,11 +93,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 4 (Filter deletions not in the actual gene of interest) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter4.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Not_in_gene", filter) out_deletions.close() @@ -102,11 +106,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 5 (Filter too short deletions) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter5.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Too_small", filter) out_deletions.close() @@ -114,11 +119,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 6 (Filter deletions with positive median) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter6.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Not_deletion", filter) out_deletions.close() @@ -126,11 +132,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 7 (Too low difference in log odds ratio) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter7.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Low_abs_diff", filter) out_deletions.close() @@ -138,11 +145,12 @@ def test_call_small_cnv_deletions_filter(self): # Filter 8 (Too few standard deviations in difference) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.test1.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, 100, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Low_nr_std_diff", filter) out_deletions.close() @@ -150,12 +158,26 @@ def test_call_small_cnv_deletions_filter(self): # Filter 9 (Filter deletions with too few data points outside to calculate median and standard deviations) cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter8.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" regions_file = open(".tests/integration/DATA/cnv_deletion_genes.tsv") out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") filter = call_small_cnv_deletions( cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, - self.min_log_odds_diff, + self.min_log_odds_diff, blacklist_file_name, ) self._test_call_small_cnv_deletions_filter("Too_few_outside", filter) out_deletions.close() regions_file.close() + + # Filter 10 (Filter blacklisted deletions) + cnv_data = ".tests/units/gatk_cnv/HD832.HES45_T.filter10.tsv" + blacklist_file_name = ".tests/integration/DATA/small_deletions_caller_blacklist.tsv" + regions_file = open(".tests/integration/DATA/cnv_deletion_genes2.tsv") + out_deletions = open(os.path.join(self.tempdir, "HD832.HES45_T.deletions.tsv"), "w") + filter = call_small_cnv_deletions( + cnv_data, regions_file, out_deletions, self.window_size, self.region_max_size, self.min_nr_stdev_diff, + self.min_log_odds_diff, blacklist_file_name, + ) + self._test_call_small_cnv_deletions_filter("Blacklist", filter) + out_deletions.close() + regions_file.close() diff --git a/workflow/scripts/report_gene_fuse.py b/workflow/scripts/report_gene_fuse.py new file mode 100644 index 00000000..428cc325 --- /dev/null +++ b/workflow/scripts/report_gene_fuse.py @@ -0,0 +1,63 @@ +fusions = open(snakemake.input.fusions) +filter_fusions = open(snakemake.params.filter_fusions) +report = open(snakemake.output.report, "w") +min_unique_reads = snakemake.params.min_unique_reads + +report.write("Gene1\tGene2\tNr_unique_reads\tGene_region1\tBreak_point1\tTranscript1\tGene_region2\tBreak_point2\t") +report.write("Transcript2\tFlag\tRead_limit\n") + +FP_gene_pairs = [] +Noisy_genes_pairs = {} + +for line in filter_fusions: + columns = line.strip().split("\t") + fusion = columns[0] + reads = int(columns[1]) + if reads == 0: + FP_gene_pairs.append(fusion) + else: + Noisy_genes_pairs[fusion] = reads + + +for line in fusions: + if line[:8] != "#Fusion:": + continue + gene1 = line.split(" ")[1].split("_")[0] + gene2 = line.split("___")[1].split("_")[0] + unique_reads = int(line.split("unique:")[1].split(")")[0]) + gene_region1 = "" + if "intron" in line.split("___")[0]: + gene_region1 = "intron " + line.split("___")[0].split("intron:")[1].split("|")[0] + if "exon" in line.split("___")[0]: + gene_region1 = "exon " + line.split("___")[0].split("exon:")[1].split("|")[0] + gene_region2 = "" + if "intron" in line.split("___")[1]: + gene_region2 = "intron " + line.split("___")[1].split("intron:")[1].split("|")[0] + if "exon" in line.split("___")[1]: + gene_region2 = "exon " + line.split("___")[1].split("exon:")[1].split("|")[0] + bp1 = "chr" + line.split("___")[0].split("chr")[1] + bp2 = "chr" + line.split("___")[1].split("chr")[1].split(" (")[0] + transcript1 = line.split("_")[1].split(":")[0] + transcript2 = line.split("___")[1].split("_")[1].split(":")[0] + min_u_r = min_unique_reads + key = gene1 + "_" + gene2 + Noisy_pair = False + Noisy_limit = 0 + if key in Noisy_genes_pairs: + min_u_r = Noisy_genes_pairs[key] + Noisy_pair = True + Noisy_limit = min_u_r + if unique_reads < min_u_r: + continue + if key in FP_gene_pairs: + continue + if Noisy_pair: + report.write( + f"{gene1}\t{gene2}\t{unique_reads}\t{gene_region1}\t{bp1}\t{transcript1}\t{gene_region2}\t{bp2}\t{transcript2}\t" + f"Noisy fusion\t{Noisy_limit}\n" + ) + else: + report.write( + f"{gene1}\t{gene2}\t{unique_reads}\t{gene_region1}\t{bp1}\t{transcript1}\t{gene_region2}\t{bp2}\t{transcript2}\t\t\n" + ) +report.close()