Merge pull request #33 from EGA-archive/headers

adding more VCF headers parsing
EGA-archive · Jan 9, 2025 · 967b972 · 967b972
2 parents 6017ec2 + b68f443
commit 967b972
Show file tree

Hide file tree

Showing 4 changed files with 131 additions and 3 deletions.
diff --git a/README.md b/README.md
@@ -58,6 +58,9 @@ The **datasetId** needs to match the id of your datasets.csv or datasets.json fi
 The **case_level_data** is a boolean parameter (True or False) which will relate your variants to the samples they belong to. In case you set this to true, please, read as well the case level data paragraph below.
 The **num_rows** is the approximate number of variants you expect in each VCF. Make sure it is greater than the total number of variants expected. This value used to be calculated automatically, but counting all the variants in a VCF was sometimes very slow.
 
+#### VCF headers
+Beacon RI Tools v2 is compatible with headers annotated by VEP. The fields read from the VEP annotation are UPLOADED_ALLELE, for setting the variant type; SYMBOL, for setting the gene id; HGVSp, for setting the amino acid change; and CONSEQUENCE, for setting the molecular effects. If your VCF isn't annotated with VEP, you can write down in the template.json file, inside [pipelines](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/pipelines/default/templates), the key names (VCF header IDs) that hold these properties in your VCF, and activate the template by setting template to true. Note that activating template.json deactivates reading the VEP headers.
+
 #### VCF pipelines for allele frequencies
 To read allele frequency variables, there is the populations.json pipeline inside [pipelines](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/pipelines/default/templates) folder.
 In order to let Beacon RI Tools v2 read the whole INFO column of your VCF and parse the allele frequency entries, you will need to specify how the different entries are named for each annotation. You will have to state how many populations there are in your VCF by setting the numberOfPopulations value: if there are no allele frequencies in the VCF, set it to 0, and if there are allele frequencies but no specific populations, fill the populations with a “Total” name.

diff --git a/genomicVariations.json b/genomicVariations.json
diff --git a/genomicVariations_vcf.py b/genomicVariations_vcf.py
@@ -32,6 +32,14 @@
 except Exception:
     pipeline = None
 
+try:
+    with open('pipelines/default/templates/template.json') as template_file:
+        template = json.load(template_file)
+        if template["template"]==False:
+            template=None
+except Exception:
+    template = None
+
 def commas(prova):
     length_iter=0
     array_of_newdicts=[]
@@ -110,7 +118,7 @@ def generate(dict_properties):
         for rec in vcf.header_iter():
             d = rec.info()
             try:
-                if d['ID'] == 'CSQ':
+                if d['ID'] == 'CSQ' and template == None:
                     format_annotation = d['Description']
                     format_list=format_annotation.split('|')
                     formatted=True
@@ -152,8 +160,116 @@ def generate(dict_properties):
 
         for v in vcf:
             dict_to_xls={}
-
-            if formatted == True:
+            if template != None:
+                varianttype=v.INFO.get(template["variantType"])
+                gene=v.INFO.get(template["geneId"])
+                aminoacidchange=v.INFO.get(template["aminoacidChange"])
+                moleculareffectt=v.INGO.get(template["molecularEffects"])
+                if "&" in moleculareffectt:
+                    moleculareffects=moleculareffectt.split("&")
+                    dict_to_xls['molecularAttributes|molecularEffects|id']=""
+                else:
+                    moleculareffects=[moleculareffectt]
+                    dict_to_xls['molecularAttributes|molecularEffects|id']=""
+                for moleculareffect in moleculareffects:
+                    if dict_to_xls['molecularAttributes|molecularEffects|id']=="":
+                        if moleculareffect == 'missense_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "ENSGLOSSARY:0000150"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'intron_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "ENSGLOSSARY:0000161"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'upstream_gene_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001631"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == '5_prime_UTR_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001623"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'synonymous_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001819"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'downstream_gene_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001632"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'non_coding_transcript_exon_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001792"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == '5_prime_UTR_premature_start_codon_gain_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001988"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'splice_region_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001630"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'intergenic_region':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0000605"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'splice_donor_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001575"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == '3_prime_UTR_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001624"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'splice_acceptor_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001574"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'stop_retained_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] = "SO:0001567"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                        elif moleculareffect == 'coding_sequence_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id']="SO:0001580"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] = moleculareffect
+                    else:
+                        if moleculareffect == 'missense_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "ENSGLOSSARY:0000150"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'intron_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "ENSGLOSSARY:0000161"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'upstream_gene_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001631"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == '5_prime_UTR_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001623"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'synonymous_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001819"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'downstream_gene_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001632"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'non_coding_transcript_exon_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001792"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == '5_prime_UTR_premature_start_codon_gain_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001988"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'splice_region_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001630"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'intergenic_region':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0000605"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'splice_donor_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001575"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == '3_prime_UTR_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001624"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'splice_acceptor_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001574"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'stop_retained_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id'] += "|"+ "SO:0001567"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+                        elif moleculareffect == 'coding_sequence_variant':
+                            dict_to_xls['molecularAttributes|molecularEffects|id']+= "|"+ "SO:0001580"
+                            dict_to_xls['molecularAttributes|molecularEffects|label'] += "|"+  moleculareffect
+
+                if gene != '':
+                    dict_to_xls['molecularAttributes|geneIds']=gene
+                if aminoacidchange!='':
+                    dict_to_xls['molecularAttributes|aminoacidChanges']=aminoacidchange
+            elif formatted == True:
                 annotation_list=v.INFO.get('CSQ')
                 if annotation_list != None:
                     annotation_list=annotation_list.split('|')
@@ -286,6 +402,7 @@ def generate(dict_properties):
                     formatted=False
 
 
+
             else:
                 try:
                     varianttype=v.INFO.get('VT')

diff --git a/pipelines/default/templates/template.json b/pipelines/default/templates/template.json
@@ -0,0 +1,7 @@
+{
+    "template": false,
+    "variantType": "VT",
+    "aminoacidChange": "HGVSp",
+    "geneId": "SYMBOL",
+    "molecularEffects": "CONSEQUENCE"
+}