Skip to content

Commit

Permalink
solved merge annotation attribute error in #143 and added test cases (#…
Browse files Browse the repository at this point in the history
…144)

* fixed the merge-annotations attribute error reported in #143 and added test cases

* fixup! Format Python code with psf/black pull_request

* added a broader NaN test and a better function description

* fixup! Format Python code with psf/black pull_request

---------

Co-authored-by: Mück <[email protected]>
Co-authored-by: PMBio <[email protected]>
  • Loading branch information
3 people authored Oct 23, 2024
1 parent 243b47a commit e42f67d
Show file tree
Hide file tree
Showing 19 changed files with 689 additions and 10 deletions.
21 changes: 11 additions & 10 deletions deeprvat/annotations/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1169,16 +1169,17 @@ def deepripe_score_variant_onlyseq_all(


def calculate_scores_max(scores):
    """Return the maximum SpliceAI delta score found in a ``SpliceAI_pred`` value.

    The value is split on ``"|"`` and the fields at zero-based positions 1-4
    (DS_AG, DS_AL, DS_DG and DS_DL) are considered. Fields that are empty or
    equal to ``"-"`` or ``"nan"`` are ignored.

    Args:
        scores: ``"|"``-separated prediction string, ``None``, or a
            non-string missing value (e.g. a float NaN).

    Returns:
        ``None`` if ``scores`` is ``None``; ``np.nan`` if ``scores`` is not a
        string or contains no usable delta score; otherwise the maximum of
        the usable delta scores.
    """
    if scores is None:
        return None
    if not isinstance(scores, str):
        # A non-string missing value (e.g. float NaN) has no ``split`` method;
        # report it as "no score" instead of raising AttributeError.
        return np.nan

    placeholders = {"", "-", "nan"}
    values = [
        float(score)
        for score in scores.split("|")[1:5]
        if score.strip() not in placeholders
    ]
    if not values:
        # ``np.nan`` rather than ``np.NaN``: the latter alias no longer
        # exists in NumPy 2.0.
        return np.nan
    return np.max(values)


@cli.command()
Expand Down Expand Up @@ -1783,7 +1784,7 @@ def process_vep(
)
if "SpliceAI_pred" in vep_file.columns:
vep_file["SpliceAI_delta_score"] = vep_file["SpliceAI_pred"].apply(
calculate_scores_max
lambda val: calculate_scores_max(val) if pd.notna(val) else np.NaN
)

if "Consequence" in vep_file.columns:
Expand Down
24 changes: 24 additions & 0 deletions tests/annotations/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,30 @@ def test_deepsea_pca(
"49",
"annotation_colnames_filling_values.yaml",
),
(
"merge_annotations_allNAN_spliceAI",
"merged_annotations_expected.parquet",
"test_hg2_deepripe.csv.gz",
"test_k5_deepripe.csv.gz",
"test_parclip.csv.gz",
"variants.parquet",
"test.vcf",
"test_vep.tsv",
"0",
"annotation_colnames_filling_values.yaml",
),
(
"merge_annotations_allbut1NAN_spliceAI",
"merged_annotations_expected.parquet",
"test_hg2_deepripe.csv.gz",
"test_k5_deepripe.csv.gz",
"test_parclip.csv.gz",
"variants.parquet",
"test.vcf",
"test_vep.tsv",
"0",
"annotation_colnames_filling_values.yaml",
),
],
)
def test_merge_annotations(
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
annotation_column_names:
'af' :
- 'AF'
- 0
- float
'maf_mb' :
- 'MAF_MB'
- 10000
- float
'maf' :
- 'MAF'
- 0
- float
'PolyPhen' :
- 'polyphen_score'
- 0
- float
'SIFT' :
- 'sift_score'
- 1
- float
'QKI_hg2' :
- 'DeepRipe_plus_QKI_lip_hg2'
- 0
- float
'QKI_k5' :
- 'DeepRipe_plus_QKI_clip_k5'
- 0
- float
'KHDRBS1_k5' :
- 'DeepRipe_plus_KHDRBS1_clip_k5'
- 0
- float
'ELAVL1_parclip' :
- 'DeepRipe_plus_ELAVL1_parclip'
- 0
- float
'TARDBP_parclip' :
- 'DeepRipe_plus_TARDBP_parclip'
- 0
- float
'HNRNPD_parclip' :
- 'DeepRipe_plus_HNRNPD_parclip'
- 0
- float
'MBNL1_parclip' :
- 'DeepRipe_plus_MBNL1_parclip'
- 0
- float
'QKI_parclip' :
- 'DeepRipe_plus_QKI_parclip'
- 0
- float
'Consequence_splice_acceptor_variant' :
- 'Consequence_splice_acceptor_variant'
- 0
- int
'Consequence_splice_donor_variant' :
- 'Consequence_splice_donor_variant'
- 0
- int
'Consequence_stop_gained' :
- 'Consequence_stop_gained'
- 0
- int
'Consequence_frameshift_variant' :
- 'Consequence_frameshift_variant'
- 0
- int
'Consequence_stop_lost' :
- 'Consequence_stop_lost'
- 0
- int
'Consequence_start_lost' :
- 'Consequence_start_lost'
- 0
- int
'Consequence_inframe_insertion' :
- 'Consequence_inframe_insertion'
- 0
- int
'Consequence_inframe_deletion' :
- 'Consequence_inframe_deletion'
- 0
- int
'Consequence_missense_variant' :
- 'Consequence_missense_variant'
- 0
- int
'Consequence_protein_altering_variant' :
- 'Consequence_protein_altering_variant'
- 0
- int
'Consequence_splice_region_variant' :
- 'Consequence_splice_region_variant'
- 0
- int
'DeepSEA_PC_1' :
- 'DeepSEA_PC_1'
- 0
- float
'DeepSEA_PC_2' :
- 'DeepSEA_PC_2'
- 0
- float
'DeepSEA_PC_3' :
- 'DeepSEA_PC_3'
- 0
- float
'DeepSEA_PC_4' :
- 'DeepSEA_PC_4'
- 0
- float
'DeepSEA_PC_5' :
- 'DeepSEA_PC_5'
- 0
- float
'DeepSEA_PC_6' :
- 'DeepSEA_PC_6'
- 0
- float
'is_plof':
- 'is_plof'
- 0
- int
'AbSplice_DNA' :
- 'AbSplice_DNA'
- 0
- float
'SpliceAI_delta_score' :
- 'SpliceAI_delta_score'
- 0
- float
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
chr3 3474106 . C T
chr3 6134790 rs881 A G
chr3 6492413 . A G
chr3 7479092 . T A
chr3 10151779 . G T
chr3 10963200 . C A
chr3 13336897 rs178 T A
chr3 25565017 rs37 C G
chr3 28027872 rs721 T C
chr3 30429305 rs135 T C
chr3 39059372 rs23 A C
chr3 47378509 rs727 T A
chr3 47839379 rs268 C T
chr3 55062103 rs873 A G
chr3 56288165 rs664 G C
chr3 64813843 rs815 A G
chr3 70306576 rs107 C G
chr3 72140079 rs492 A T
chr3 72906610 rs930 T G
chr3 74562325 rs523 G T
chr3 78839934 rs583 G A
chr3 81414874 rs170 A T
chr3 97458263 rs548 A T
chr3 97649369 rs546 C G
chr3 97949211 rs543 G A
chr3 99075824 rs838 T C
chr3 101580812 rs311 A C
chr3 103151123 rs382 C A
chr3 103329532 rs179 T C
chr3 103928516 rs19 A T
chr3 105180981 rs341 A G
chr3 111113126 rs470 A G
chr3 111866541 rs467 T A
chr3 117455785 rs718 C A
chr3 120258434 rs506 A C
chr3 120364684 rs367 T G
chr3 122803142 rs488 A C
chr3 125013245 rs146 A G
chr3 127342540 rs318 G T
chr3 133734681 rs104 G A
chr3 139349025 rs665 T C
chr3 140275153 rs791 G C
chr3 145304395 rs102 C G
chr3 147901161 rs274 C T
chr3 150051584 rs123 C A
chr3 150399452 rs648 T A
chr3 158349305 rs748 T A
chr3 158851780 rs408 A T
chr3 160382108 rs963 A C
chr3 168465216 rs751 G C
chr3 171089322 rs197 A T
chr3 177499702 rs376 G C
chr3 185836100 rs581 G T
chr3 191856146 rs596 G C
chr3 192824921 rs701 C G
chr3 193390684 rs434 A C
chr3 194577309 rs70 T G
chr3 194762766 rs598 G A
chr3 197345633 rs356 T A
chr3 197732094 rs693 G T
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
Unnamed: 0 #Uploaded_variation Location Allele Gene Feature Feature_type Consequence cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation IMPACT DISTANCE STRAND FLAGS BIOTYPE CANONICAL ENSP SIFT PolyPhen AF CLIN_SIG SOMATIC PHENO SpliceAI_pred
0 0 3_3474106_C/T 3:3474106 T ENSG00000223727 ENST00000420000 Transcript intron_variant,non_coding_transcript_variant MODIFIER -1.0 lncRNA YES
1 1 rs881 3:6134790 G intergenic_variant rs1007430246 MODIFIER
2 2 3_6492413_A/G 3:6492413 G ENSG00000189229 ENST00000655754 Transcript intron_variant,non_coding_transcript_variant MODIFIER 1.0 lncRNA YES
3 3 3_7479092_T/A 3:7479092 A ENSG00000196277 ENST00000357716 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000350348
4 4 3_10151779_G/T 3:10151779 T ENSG00000287086 ENST00000660063 Transcript intron_variant,non_coding_transcript_variant COSV56556349 MODIFIER -1.0 lncRNA YES 1.0 1.0
5 5 3_10151779_G/T 3:10151779 T ENSG00000134086 ENST00000256474 Transcript 3_prime_UTR_variant 2526/4414 COSV56556349 MODIFIER 1.0 protein_coding YES ENSP00000256474 1.0 1.0
6 6 3_10963200_C/A 3:10963200 A ENSG00000286962 ENST00000656787 Transcript intron_variant,non_coding_transcript_variant MODIFIER -1.0 lncRNA YES
7 7 rs178 3:13336897 A ENSG00000132182 ENST00000254508 Transcript missense_variant 3670/7206 3574/5664 1192/1887 I/F Atc/Ttc MODERATE -1.0 protein_coding YES ENSP00000254508 0.01 0.712
8 8 rs37 3:25565017 G ENSG00000077092 ENST00000330688 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000332296
9 9 rs721 3:28027872 C ENSG00000235493 ENST00000356047 Transcript intron_variant,non_coding_transcript_variant MODIFIER -1.0 lncRNA
10 10 rs135 3:30429305 C ENSG00000289450 ENST00000691186 Transcript intron_variant,non_coding_transcript_variant MODIFIER 1.0 lncRNA YES
11 11 rs23 3:39059372 C ENSG00000114742 ENST00000302313 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000307491
12 12 rs727 3:47378509 A ENSG00000260236 ENST00000568593 Transcript downstream_gene_variant MODIFIER 580.0 -1.0 lncRNA YES
13 13 rs727 3:47378509 A ENSG00000076201 ENST00000265562 Transcript upstream_gene_variant MODIFIER 2512.0 1.0 protein_coding YES ENSP00000265562
14 14 rs268 3:47839379 T ENSG00000132153 ENST00000445061 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000405620
15 15 rs873 3:55062103 G ENSG00000157445 ENST00000474759 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000419101
16 16 rs664 3:56288165 C ENSG00000187672 ENST00000288221 Transcript intron_variant MODIFIER -1.0 protein_coding YES ENSP00000288221
17 17 rs815 3:64813843 G ENSG00000241684 ENST00000650103 Transcript intron_variant,non_coding_transcript_variant MODIFIER 1.0 lncRNA YES
18 18 rs107 3:70306576 G ENSG00000240405 ENST00000642114 Transcript intron_variant,non_coding_transcript_variant rs536908099 MODIFIER 1.0 lncRNA YES 0.0002
19 19 rs107 3:70306576 G ENSG00000242120 ENST00000567252 Transcript intron_variant rs536908099 MODIFIER -1.0 protein_coding YES ENSP00000490638 0.0002
20 20 rs492 3:72140079 T ENSG00000241163 ENST00000626474 Transcript intron_variant,non_coding_transcript_variant MODIFIER -1.0 lncRNA YES
21 21 rs930 3:72906610 G ENSG00000172986 ENST00000389617 Transcript intron_variant COSV67474846 MODIFIER 1.0 protein_coding YES ENSP00000374268 1.0 1.0
22 22 rs523 3:74562325 T ENSG00000113805 ENST00000263665 Transcript intron_variant MODIFIER -1.0 protein_coding YES ENSP00000263665
23 23 rs583 3:78839934 A ENSG00000169855 ENST00000464233 Transcript intron_variant MODIFIER -1.0 protein_coding YES ENSP00000420321
24 24 rs170 3:81414874 T intergenic_variant MODIFIER
25 25 rs548 3:97458263 T ENSG00000080224 ENST00000389672 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000374323
26 26 rs546 3:97649369 G ENSG00000080224 ENST00000389672 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000374323
27 27 rs543 3:97949211 A ENSG00000080200 ENST00000389622 Transcript downstream_gene_variant MODIFIER 4227.0 1.0 protein_coding YES ENSP00000374273
28 28 rs543 3:97949211 A ENSG00000170854 ENST00000394198 Transcript intron_variant MODIFIER -1.0 protein_coding YES ENSP00000377748
29 29 rs838 3:99075824 C intergenic_variant MODIFIER
30 30 rs311 3:101580812 C ENSG00000081154 ENST00000265260 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000265260
31 31 rs311 3:101580812 C ENSG00000242299 ENST00000496294 Transcript upstream_gene_variant MODIFIER 3865.0 -1.0 processed_pseudogene YES
32 32 rs382 3:103151123 A intergenic_variant MODIFIER
33 33 rs179 3:103329532 C intergenic_variant MODIFIER
34 34 rs19 3:103928516 T intergenic_variant MODIFIER
35 35 rs341 3:105180981 G intergenic_variant MODIFIER
36 36 rs470 3:111113126 G ENSG00000177707 ENST00000485303 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000418070
37 37 rs467 3:111866541 A ENSG00000144824 ENST00000431670 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000405405
38 38 rs718 3:117455785 A intergenic_variant MODIFIER
39 39 rs506 3:120258434 C ENSG00000175697 ENST00000464295 Transcript intron_variant rs1470110063 MODIFIER -1.0 protein_coding YES ENSP00000417261
40 40 rs367 3:120364684 G ENSG00000240661 ENST00000634410 Transcript intron_variant,non_coding_transcript_variant MODIFIER 1.0 transcribed_unitary_pseudogene YES
41 41 rs367 3:120364684 G ENSG00000282950 ENST00000634744 Transcript downstream_gene_variant MODIFIER 1309.0 -1.0 lncRNA YES
42 42 rs488 3:122803142 C ENSG00000138463 ENST00000261038 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000261038
43 43 rs146 3:125013245 G ENSG00000173706 ENST00000311127 Transcript synonymous_variant 2441/9195 2334/4146 778/1381 L ctT/ctC LOW -1.0 protein_coding YES ENSP00000311502
44 44 rs318 3:127342540 T ENSG00000244215 ENST00000488425 Transcript intron_variant,non_coding_transcript_variant MODIFIER -1.0 lncRNA YES
45 45 rs104 3:133734681 A ENSG00000291042 ENST00000460564 Transcript intron_variant,non_coding_transcript_variant MODIFIER 1.0 lncRNA YES
46 46 rs665 3:139349025 C ENSG00000272656 ENST00000608472 Transcript non_coding_transcript_exon_variant 347/348 MODIFIER -1.0 lncRNA YES
47 47 rs665 3:139349025 C ENSG00000184432 ENST00000503326 Transcript downstream_gene_variant MODIFIER 4921.0 -1.0 protein_coding ENSP00000426682
48 48 rs665 3:139349025 C ENSG00000175110 ENST00000680020 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000505414
49 49 rs791 3:140275153 C ENSG00000158258 ENST00000458420 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000402460
50 50 rs102 3:145304395 G intergenic_variant MODIFIER
51 51 rs274 3:147901161 T intergenic_variant rs1305018800 MODIFIER
52 52 rs123 3:150051584 A ENSG00000243944 ENST00000487840 Transcript intron_variant,non_coding_transcript_variant MODIFIER 1.0 lncRNA YES
53 53 rs123 3:150051584 A ENSG00000070087 ENST00000497148 Transcript upstream_gene_variant MODIFIER 796.0 -1.0 protein_coding ENSP00000417817
54 54 rs123 3:150051584 A ENSG00000240477 ENST00000466044 Transcript downstream_gene_variant MODIFIER 128.0 1.0 processed_pseudogene YES
55 55 rs648 3:150399452 A intergenic_variant MODIFIER
56 56 rs748 3:158349305 A ENSG00000174891 ENST00000611884 Transcript intron_variant MODIFIER 1.0 protein_coding YES ENSP00000481697
57 57 rs408 3:158851780 T intergenic_variant MODIFIER
58 58 rs963 3:160382108 C ENSG00000068885 ENST00000326448 Transcript intron_variant MODIFIER -1.0 protein_coding YES ENSP00000312778
59 59 rs751 3:168465216 C ENSG00000206120 ENST00000431685 Transcript intron_variant,non_coding_transcript_variant MODIFIER 1.0 transcribed_unitary_pseudogene YES
60 60 rs197 3:171089322 T ENSG00000154310 ENST00000436636 Transcript intron_variant MODIFIER -1.0 protein_coding YES ENSP00000399511
61 61 rs376 3:177499702 C ENSG00000252028 ENST00000516219 Transcript downstream_gene_variant MODIFIER 3601.0 -1.0 misc_RNA YES
62 62 rs376 3:177499702 C ENSG00000228221 ENST00000656037 Transcript intron_variant,non_coding_transcript_variant MODIFIER 1.0 lncRNA YES
63 63 rs581 3:185836100 T intergenic_variant MODIFIER
64 64 rs596 3:191856146 C intergenic_variant MODIFIER
65 65 rs701 3:192824921 G ENSG00000180611 ENST00000392452 Transcript intron_variant MODIFIER -1.0 protein_coding YES ENSP00000376246
66 66 rs434 3:193390684 C intergenic_variant MODIFIER
67 67 rs70 3:194577309 G intergenic_variant MODIFIER
68 68 rs598 3:194762766 A ENSG00000237222 ENST00000667646 Transcript intron_variant,non_coding_transcript_variant MODIFIER 1.0 lncRNA YES
69 69 rs598 3:194762766 A ENSG00000230401 ENST00000422271 Transcript downstream_gene_variant MODIFIER 2472.0 -1.0 lncRNA YES
70 70 rs356 3:197345633 A ENSG00000286870 ENST00000669801 Transcript downstream_gene_variant MODIFIER 4927.0 1.0 lncRNA YES
71 71 rs693 3:197732094 T ENSG00000145016 ENST00000296343 Transcript intron_variant MODIFIER -1.0 protein_coding YES ENSP00000296343
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit e42f67d

Please sign in to comment.