Skip to content

Commit

Permalink
Merge pull request #33 from EGA-archive/headers
Browse files Browse the repository at this point in the history
adding more VCF headers parsing
  • Loading branch information
costero-e authored Jan 9, 2025
2 parents 6017ec2 + b68f443 commit 967b972
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 3 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ The **datasetId** needs to match the id of your datasets.csv or datasets.json fi
The **case_level_data** is a boolean parameter (True or False) which will relate your variants to the samples they belong to. If you set this to true, please also read the case level data paragraph below.
The **num_rows** is the approximate number of variants you expect in each VCF. Make sure it is greater than the actual total number of variants. It used to be calculated automatically, but counting all the variants in a VCF was sometimes very slow.

#### VCF headers
Beacon RI Tools v2 is compatible with headers annotated by VEP. The parameters that are read from VEP are UPLOADED_ALLELE, for setting the variant type; SYMBOL, for setting the gene id; HGVSp, for setting the amino acid change; and CONSEQUENCE, for setting the molecular effects. If your VCF isn't annotated with VEP, you can specify in the template.json file inside [pipelines](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/pipelines/default/templates) which key names in your VCF header ids correspond to these properties, and then activate the template by setting it to true. Note that activating template.json disables reading the VEP headers.

#### VCF pipelines for allele frequencies
To read allele frequency variables, there is the populations.json pipeline inside [pipelines](https://github.com/EGA-archive/beacon2-ri-tools-v2/tree/main/pipelines/default/templates) folder.
In order to let Beacon RI Tools v2 read the whole INFO column of your VCF and parse the allele frequency entries, you will need to specify how the different entries are named for each annotation. You will have to state how many populations there are in your VCF by setting the numberOfPopulations value: if there are no allele frequencies in the VCF, set it to 0; if there are allele frequencies but no specific populations, fill the populations with a “Total” name.
Expand Down
1 change: 1 addition & 0 deletions genomicVariations.json

Large diffs are not rendered by default.

123 changes: 120 additions & 3 deletions genomicVariations_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@
except Exception:
pipeline = None

# Optional user-defined mapping from VCF INFO keys to Beacon properties.
# ``template`` ends up as the parsed template.json dict only when the file
# can be read, parses as JSON, and its "template" flag is switched on.
# In every other case it is None, which leaves the VEP (CSQ) header parsing
# further down in this module active.
try:
    with open('pipelines/default/templates/template.json', encoding='utf-8') as template_file:
        template = json.load(template_file)
    # Truthiness instead of ``== False``: a null/empty/0 flag must not
    # activate the template (and thereby silently disable VEP parsing).
    if not template["template"]:
        template = None
except Exception:
    # Best effort, mirroring the pipeline loader above: a missing or
    # malformed template file simply means "no template".
    template = None

def commas(prova):
length_iter=0
array_of_newdicts=[]
Expand Down Expand Up @@ -110,7 +118,7 @@ def generate(dict_properties):
for rec in vcf.header_iter():
d = rec.info()
try:
if d['ID'] == 'CSQ':
if d['ID'] == 'CSQ' and template == None:
format_annotation = d['Description']
format_list=format_annotation.split('|')
formatted=True
Expand Down Expand Up @@ -152,8 +160,116 @@ def generate(dict_properties):

for v in vcf:
dict_to_xls={}

if formatted == True:
if template is not None:
    # A user-supplied template is active: read every property straight from
    # the INFO key named in template.json instead of parsing the VEP CSQ field.
    varianttype=v.INFO.get(template["variantType"])
    gene=v.INFO.get(template["geneId"])
    aminoacidchange=v.INFO.get(template["aminoacidChange"])
    # Fixed typo: this previously read ``v.INGO`` and raised AttributeError
    # on the first variant whenever the template was active.
    moleculareffectt=v.INFO.get(template["molecularEffects"])

    # Consequence term -> ontology id written to the molecularEffects column.
    # Terms that are not listed here are ignored.
    molecular_effect_ids = {
        'missense_variant': "ENSGLOSSARY:0000150",
        'intron_variant': "ENSGLOSSARY:0000161",
        'upstream_gene_variant': "SO:0001631",
        '5_prime_UTR_variant': "SO:0001623",
        'synonymous_variant': "SO:0001819",
        'downstream_gene_variant': "SO:0001632",
        'non_coding_transcript_exon_variant': "SO:0001792",
        '5_prime_UTR_premature_start_codon_gain_variant': "SO:0001988",
        'splice_region_variant': "SO:0001630",
        'intergenic_region': "SO:0000605",
        'splice_donor_variant': "SO:0001575",
        '3_prime_UTR_variant': "SO:0001624",
        'splice_acceptor_variant': "SO:0001574",
        'stop_retained_variant': "SO:0001567",
        'coding_sequence_variant': "SO:0001580",
    }

    # Several consequences for one variant arrive joined by "&".
    # INFO.get() yields None when the key is absent from the record; treat
    # that (and any non-string value) as "no effects" rather than crashing
    # on the "&" membership test.
    if isinstance(moleculareffectt, str):
        moleculareffects=moleculareffectt.split("&")
    else:
        moleculareffects=[]

    recognised_effects=[effect for effect in moleculareffects if effect in molecular_effect_ids]
    # The id column is always written (empty when nothing was recognised);
    # the label column is only written when at least one term was recognised.
    # Multiple recognised terms are joined with "|", ids and labels in the
    # same order.
    dict_to_xls['molecularAttributes|molecularEffects|id']="|".join(
        molecular_effect_ids[effect] for effect in recognised_effects
    )
    if recognised_effects:
        dict_to_xls['molecularAttributes|molecularEffects|label']="|".join(recognised_effects)

    # Skip absent values as well as empty strings so that None is never
    # written into the output row.
    if gene not in (None, ''):
        dict_to_xls['molecularAttributes|geneIds']=gene
    if aminoacidchange not in (None, ''):
        dict_to_xls['molecularAttributes|aminoacidChanges']=aminoacidchange
elif formatted == True:
annotation_list=v.INFO.get('CSQ')
if annotation_list != None:
annotation_list=annotation_list.split('|')
Expand Down Expand Up @@ -286,6 +402,7 @@ def generate(dict_properties):
formatted=False



else:
try:
varianttype=v.INFO.get('VT')
Expand Down
7 changes: 7 additions & 0 deletions pipelines/default/templates/template.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"template": false,
"variantType": "VT",
"aminoacidChange": "HGVSp",
"geneId": "SYMBOL",
"molecularEffects": "CONSEQUENCE"
}

0 comments on commit 967b972

Please sign in to comment.