Development (#24)
* Add inheritance information to the result vcf

* Fix typo

* Fix a problem with the multiprocessing

* Fix some problems with multiprocessing in standalone mode. Introduce small changes to the result file creation to prevent possible problems.

* Preparations for new feature

* Add new feature

* Prevent too strict filtering

* Fix possible problems when gene symbols are cased differently in different resources.

* Add missing condition to make filtering less strict

* Add todo notes

* Frameshift and splicing (#21)

* Improve filtering and add frameshift handling

* Add splicing predictions and improve filtering

* Refactoring (#22)

* Use 'functools' to get rid of global variables in the context of multiprocessing.

* Update HPO resources to prevent problems. Now gene symbols are in all capital letters in each mapping file

* Update HPO resources generation script to include the gene to interacting genes resource

* Refactoring and removal of unused code fragments

* Bugfix

* Make ABB filter less strict to be more sensitive

* Small changes

* Use logging instead of terminal outputs. Fix some problems with the multisample handling and activate the functionality.

* Remove unnecessary debug messages.

* Remove merge artifact
dboceck authored Jul 27, 2021
1 parent b283301 commit a60186f
Showing 14 changed files with 968 additions and 102 deletions.
@@ -5,6 +5,10 @@
import argparse
from functools import partial
from operator import itemgetter
import logging


logger = logging.getLogger(__name__)


def annotate_indels_with_combined_snps_information(row, grouped_expanded_vcf, feature):
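Each touched helper module now creates a per-module logger via `logging.getLogger(__name__)`. Those loggers only emit output once the application entry point configures logging; a minimal sketch of such a setup (level and format here are assumptions, not part of this commit):

```python
import logging

# Hypothetical one-time setup in the AIdiva run script; the real entry
# point may use a file handler, another level, or another format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
```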
10 changes: 7 additions & 3 deletions aidiva/helper_modules/convert_indels_to_snps_and_create_vcf.py
@@ -3,6 +3,10 @@
import argparse
import random
from Bio import SeqIO
import logging


logger = logging.getLogger(__name__)


def write_data_information_to_file(input_data, outfile, ref_sequence, header):
@@ -36,10 +40,10 @@ def write_data_information_to_file(input_data, outfile, ref_sequence, header):
elif (extended_ref_seq[i] == "G") or (extended_ref_seq[i] == "C"):
alt_variant = random.choice(["A", "T"])
elif (extended_ref_seq[i] == "N"):
print("WARNING: Reference base skipped since it was N!")
logger.warn("Reference base was skipped because it was 'N'!")
continue
else:
print("ERROR: Something went wrong!")
logger.error("The given reference sequence seems to be corrupted!")

outfile.write(str(row.CHROM).strip() + "\t" + str(window_start + i + 1).strip() + "\t" + "." + "\t" + str(extended_ref_seq[i]).strip() + "\t" + str(alt_variant).strip() + "\t" + "." + "\t" + "." + "\t" + str(row.INFO).strip() + "\n")

@@ -62,7 +66,7 @@ def import_vcf_data(in_data):
continue

if header_line == "":
print("ERROR: The VCF file seems to be corrupted")
logger.error("The VCF seems to be corrupted, missing header line!")

# reset file pointer to begin reading at the beginning
input_vcf.close()
148 changes: 100 additions & 48 deletions aidiva/helper_modules/convert_vcf_to_csv.py
@@ -5,6 +5,7 @@
import argparse
from functools import partial
from operator import itemgetter
import logging


variant_consequences = {"transcript_ablation": 1,
@@ -44,6 +45,8 @@
"feature_truncation": 35,
"intergenic_variant": 36}

logger = logging.getLogger(__name__)


def reformat_vcf_file_and_read_into_pandas_and_extract_header(filepath):
header_line = ""
@@ -68,7 +71,7 @@ def reformat_vcf_file_and_read_into_pandas_and_extract_header(filepath):
continue

if header_line == "":
print("ERROR: The VCF file seems to be corrupted")
logger.warn("The VCF file seems to be corrupted")

vcf_header = header_line.strip().split("\t")
vcf_as_dataframe = pd.read_csv(tmp.name, names=vcf_header, sep="\t", comment="#", low_memory=False)
@@ -88,6 +91,18 @@ def extract_annotation_header(header):
return annotation_header


def extract_sample_header(header):
sample_header = []

for line in header:
if line.startswith("##SAMPLE=<"):
sample_entry = line.strip().replace("##SAMPLE=<", "").replace(">", "")
sample_id = sample_entry.split(",")[0]
sample_header.append(sample_id.split("=")[1])

return sample_header
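The new `extract_sample_header` helper collects the sample IDs from the `##SAMPLE=<...>` meta lines so that multi-sample columns can be decoded later. A small illustration with invented header lines (every attribute besides `ID` is an assumption):

```python
# Invented VCF meta lines; only the leading "ID=..." attribute is used.
header = [
    "##fileformat=VCFv4.2",
    "##SAMPLE=<ID=index,DiseaseStatus=affected>",
    "##SAMPLE=<ID=mother,DiseaseStatus=unaffected>",
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
]

print(extract_sample_header(header))  # -> ['index', 'mother']
```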


def extract_columns(cell, process_indel):
info_fields = str(cell).split(";")

@@ -156,53 +171,78 @@ def extract_vep_annotation(cell, annotation_header):
return new_cols


def extract_sample_information(row, sample):
sample_header = str(row["FORMAT"]).strip().split(":")
sample_fields = str(row[sample + ".full"]).strip().split(":")

if len(sample_header) != len(sample_fields):
num_missing_entries = abs(len(sample_header) - len(sample_fields))
for i in range(num_missing_entries):
sample_fields.append(".")
def extract_sample_information(row, sample, sample_header=None):
if str(row["FORMAT"]) == "MULTI":
if sample_header is not None:
sample_dict = {}
sample_information = []
for sample_entry in row[sample].split(","):
splitted_entry = sample_entry.split("=")
sample_dict[splitted_entry[0]] = splitted_entry[1]

for sample_id in sample_header:
splitted_sample_information = sample_dict[sample_id].split("|")
if splitted_sample_information[0] == "wt":
sample_information.append("0/0")
elif splitted_sample_information[0] == "hom":
sample_information.append("1/1")
elif splitted_sample_information[0] == "het":
sample_information.append("0/1")
else:
logger.warning("Genotype not recognized! (%s)" % (splitted_sample_information[0]))

sample_information.append(splitted_sample_information[1])
sample_information.append(splitted_sample_information[2])
sample_information.append(sample_id + "=" + sample_dict[sample_id])
else:
logger.error("Format is MULTI but no sample_header is given!")
else:
format_entries = str(row["FORMAT"]).strip().split(":")
sample_fields = str(row[sample + ".full"]).strip().split(":")

if len(sample_header) != len(sample_fields):
num_missing_entries = abs(len(sample_header) - len(sample_fields))
for i in range(num_missing_entries):
sample_fields.append(".")
if len(format_entries) != len(sample_fields):
num_missing_entries = abs(len(format_entries) - len(sample_fields))
for i in range(num_missing_entries):
sample_fields.append(".")

if "GT" in sample_header:
sample_gt_information = sample_fields[sample_header.index("GT")]
else:
sample_gt_information = "./."
if len(format_entries) != len(sample_fields):
num_missing_entries = abs(len(format_entries) - len(sample_fields))
for i in range(num_missing_entries):
sample_fields.append(".")

if "DP" in sample_header:
sample_dp_information = sample_fields[sample_header.index("DP")]
else:
sample_dp_information = "."
if "GT" in format_entries:
sample_gt_information = sample_fields[format_entries.index("GT")]
else:
sample_gt_information = "./."

if "AD" in sample_header:
sample_ref_information = sample_fields[sample_header.index("AD")].split(",")[0]
sample_alt_information = sample_fields[sample_header.index("AD")].split(",")[1]
else:
sample_ref_information = "."
sample_alt_information = "."
if "DP" in format_entries:
sample_dp_information = sample_fields[format_entries.index("DP")]
else:
sample_dp_information = "."

if "GQ" in sample_header:
sample_gq_information = sample_fields[sample_header.index("GQ")]
else:
sample_gq_information = "."
if "AD" in format_entries:
sample_ref_information = sample_fields[format_entries.index("AD")].split(",")[0]
sample_alt_information = sample_fields[format_entries.index("AD")].split(",")[1]
else:
sample_ref_information = "."
sample_alt_information = "."

if (sample_ref_information != ".") and (sample_alt_information != "."):
divisor = (int(sample_ref_information) + int(sample_alt_information))
if divisor == 0:
sample_af_information = 0
if "GQ" in format_entries:
sample_gq_information = sample_fields[format_entries.index("GQ")]
else:
sample_af_information = (int(sample_alt_information) / divisor)
sample_gq_information = "."

else:
sample_af_information = "."
if (sample_ref_information != ".") and (sample_alt_information != "."):
divisor = (int(sample_ref_information) + int(sample_alt_information))
if divisor == 0:
sample_af_information = 0
else:
sample_af_information = (int(sample_alt_information) / divisor)

else:
sample_af_information = "."

sample_information = [sample_gt_information, sample_dp_information, sample_ref_information, sample_alt_information, sample_af_information, sample_gq_information]
sample_information = [sample_gt_information, sample_dp_information, sample_ref_information, sample_alt_information, sample_af_information, sample_gq_information]

return sample_information
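For multi-sample (e.g. trio) calls the `FORMAT` column is set to `MULTI` and a single cell packs all samples. A hedged walk-through with an invented cell value (the `genotype|value|value` layout is inferred from the split logic above; judging by the dataframe columns added below, the two numeric fields end up as `DP` and `ALT`):

```python
# Invented MULTI cell for illustration; real caller output may differ.
multi_cell = "index=het|25|12,mother=wt|30|0,father=hom|28|27"

genotype_map = {"wt": "0/0", "het": "0/1", "hom": "1/1"}

for entry in multi_cell.split(","):
    sample_id, packed = entry.split("=")
    fields = packed.split("|")
    print(sample_id, genotype_map[fields[0]], fields[1], fields[2])
# index 0/1 25 12
# mother 0/0 30 0
# father 1/1 28 27
```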

@@ -225,12 +265,21 @@ def add_VEP_annotation_to_dataframe(annotation_header, vcf_as_dataframe):
return vcf_as_dataframe


def add_sample_information_to_dataframe(sample_ids, vcf_as_dataframe):
def add_sample_information_to_dataframe(sample_ids, sample_header, vcf_as_dataframe):
for sample in sample_ids:
vcf_as_dataframe.rename(columns={sample: sample + ".full"}, inplace=True)
sample_header = ["GT." + sample, "DP." + sample, "REF." + sample, "ALT." + sample, "AF." + sample, "GQ." + sample]
vcf_as_dataframe[sample_header] = vcf_as_dataframe.apply(lambda x: pd.Series(extract_sample_information(x, sample)), axis=1)
vcf_as_dataframe = vcf_as_dataframe.drop(columns=[sample + ".full"])
if (sample == "trio") or (sample == "multi"):
sample_header_multi = []
for sample_id in sample_header:
sample_header_multi.append("GT." + sample_id)
sample_header_multi.append("DP." + sample_id)
sample_header_multi.append("ALT." + sample_id)
sample_header_multi.append(sample_id + ".full")
vcf_as_dataframe[sample_header_multi] = vcf_as_dataframe.apply(lambda x: pd.Series(extract_sample_information(x, sample, sample_header)), axis=1)
else:
vcf_as_dataframe.rename(columns={sample: sample + ".full"}, inplace=True)
sample_header_default = ["GT." + sample, "DP." + sample, "REF." + sample, "ALT." + sample, "AF." + sample, "GQ." + sample]
vcf_as_dataframe[sample_header_default] = vcf_as_dataframe.apply(lambda x: pd.Series(extract_sample_information(x, sample)), axis=1)
vcf_as_dataframe = vcf_as_dataframe.drop(columns=[sample + ".full"])

vcf_as_dataframe = vcf_as_dataframe.drop(columns=["FORMAT"])
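`add_sample_information_to_dataframe` leans on the pandas idiom of returning a `pd.Series` from `apply` to fan one packed column out into several new columns in a single pass. A minimal, self-contained sketch of that pattern:

```python
import pandas as pd

df = pd.DataFrame({"packed": ["0/1:25", "1/1:30"]})

# Returning a Series from apply assigns one value per target column.
df[["GT", "DP"]] = df.apply(lambda row: pd.Series(row["packed"].split(":")), axis=1)

print(df)
#    packed   GT  DP
# 0  0/1:25  0/1  25
# 1  1/1:30  1/1  30
```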

@@ -240,31 +289,34 @@ def add_sample_information_to_dataframe(sample_ids, vcf_as_dataframe):
def convert_vcf_to_pandas_dataframe(input_file, process_indel, num_cores):
header, vcf_as_dataframe = reformat_vcf_file_and_read_into_pandas_and_extract_header(input_file)

logger.info("Convert")

sample_ids = []
# FORMAT column has index 8 (counted from 0) and sample columns follow afterwards (sample names are unique)
## TODO: add condition to check if FORMAT column exists
for i in range(9, len(vcf_as_dataframe.columns)):
sample_ids.append(vcf_as_dataframe.columns[i])

annotation_header = extract_annotation_header(header)
sample_header = extract_sample_header(header)

if not vcf_as_dataframe.empty:
vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_INFO_fields_to_dataframe, process_indel), num_cores)
vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_VEP_annotation_to_dataframe, annotation_header), num_cores)

if len(vcf_as_dataframe.columns) > 8:
if "FORMAT" in vcf_as_dataframe.columns:
vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_sample_information_to_dataframe, sample_ids), num_cores)
vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_sample_information_to_dataframe, sample_ids, sample_header), num_cores)
else:
# This warning is always triggered when the expanded indel VCF file is processed; if it appears only once in that situation, it can be ignored.
print("WARNING: It seems that your VCF file does contain sample information but not FORMAT description!")
logger.warn("It seems that your VCF file does contain sample information but no FORMAT description!")
else:
print("MISSING SAMPLE INFORMATION!")
logger.error("MISSING SAMPLE INFORMATION!")

# replace empty strings or only spaces with NaN
vcf_as_dataframe = vcf_as_dataframe.replace(r"^\s*$", np.nan, regex=True)
else:
print("WARNING: The given VCF file is empty!")
logger.error("The given VCF file is empty!")

return vcf_as_dataframe
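The call sites above pre-bind the constant arguments with `functools.partial`, so each worker function only receives a dataframe chunk; this is what lets the refactoring drop the module-level globals that the old multiprocessing setup relied on. `parallelize_dataframe_processing` itself is not shown in this diff; the following is a plausible sketch, offered as an assumption based on its call signature:

```python
import multiprocessing as mp

import numpy as np
import pandas as pd

def parallelize_dataframe_processing(vcf_as_dataframe, function, num_cores):
    # Assumed implementation: split the frame, map the pre-bound
    # worker over a pool, and reassemble the processed chunks.
    chunks = np.array_split(vcf_as_dataframe, num_cores)
    with mp.Pool(processes=num_cores) as pool:
        processed_chunks = pool.map(function, chunks)
    return pd.concat(processed_chunks)
```

Because `partial(add_INFO_fields_to_dataframe, process_indel)` fixes `process_indel` up front, the pool only ever has to pass the chunk argument to each task.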

4 changes: 4 additions & 0 deletions aidiva/helper_modules/create_result_vcf.py
@@ -1,6 +1,10 @@
import pandas as pd
import numpy as np
import argparse
import logging


logger = logging.getLogger(__name__)


def write_header(out_file, single):
9 changes: 8 additions & 1 deletion aidiva/helper_modules/filter_vcf.py
@@ -1,6 +1,7 @@
import argparse
import gzip
import tempfile
import logging


coding_variants = ["splice_acceptor_variant",
@@ -22,6 +23,8 @@
"5_prime_UTR_variant",
"3_prime_UTR_variant"]

logger = logging.getLogger(__name__)


def filter_coding_variants(filepath, filepath_out, annotation_field_name):
if filepath.endswith(".gz"):
@@ -62,6 +65,9 @@ def filter_coding_variants(filepath, filepath_out, annotation_field_name):
elif line.strip().startswith("##FILTER="):
outfile.write(line)
continue
elif line.strip().startswith("##ANALYSISTYPE="):
outfile.write(line)
continue
elif line.strip().startswith("##SAMPLE="):
outfile.write(line)
continue
@@ -102,7 +108,8 @@ def filter_coding_variants(filepath, filepath_out, annotation_field_name):
splitted_line[7] = "."
outfile.write("\t".join(splitted_line))
else:
print("WARNING: Annotation field missing!")
logger.error("Annotation field missing!")
logger.warn("Variant filtering will be skipped!")

vcf_file_to_reformat.close()
outfile.close()
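`filter_coding_variants` transparently accepts plain and gzipped VCFs via the `.gz` suffix check at the top of the function. The same pattern, extracted into a generic helper as a sketch:

```python
import gzip

def open_vcf(filepath):
    # gzip.open in text mode ("rt") for compressed files, plain open otherwise.
    if filepath.endswith(".gz"):
        return gzip.open(filepath, "rt")
    return open(filepath, "r")
```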
2 changes: 1 addition & 1 deletion aidiva/helper_modules/generate_HPO_resources.py
@@ -91,7 +91,7 @@ def extract_hpo_graph_edges(hpo_ontology, hpo_edges_file):
# awk -F '\t' '{print $5}' < phenotype_annotation.tab | sort | uniq -c | awk '{print $2 "\t" $1}' > HPO_counts.txt
def generate_hpo_graph(hpo_counts, hpo_edges_file, hpo_graph_file):
print("Generate HPO graph...")
offset =1000 # penalization for the distance
offset = 1000 # penalization for the distance
# idea: build a graph in which each node carries its calculated IC value as an attribute
output = hpo_graph_file # contains the nodes and edges to import in a DiGraph
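The graph generation attaches an information-content (IC) value to each HPO node, computed from the term counts produced by the awk one-liner above. IC is conventionally the negative log of a term's annotation frequency; a sketch under that convention (the function name and exact file handling are assumptions):

```python
import math

def compute_ic_values(hpo_counts_file):
    # Assumed two-column layout "HP:XXXXXXX<TAB>count", matching the awk output above.
    counts = {}
    with open(hpo_counts_file) as infile:
        for line in infile:
            term, count = line.strip().split("\t")
            counts[term] = int(count)

    total = sum(counts.values())
    # IC(term) = -log(p(term)): the rarer a term, the more informative it is.
    return {term: -math.log(count / total) for term, count in counts.items()}
```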

9 changes: 6 additions & 3 deletions aidiva/helper_modules/split_vcf_in_indel_and_snp_set.py
@@ -3,6 +3,10 @@
import tempfile
import argparse
import gzip
import logging


logger = logging.getLogger(__name__)


def split_vcf_file_in_indel_and_snps_set(filepath, filepath_snp, filepath_indel):
@@ -38,8 +42,7 @@ def split_vcf_file_in_indel_and_snps_set(filepath, filepath_snp, filepath_indel)
# remove variants with multiple alternative alleles reported
# TODO decide how to handle them in general
if "," in splitted_line[4]:
print("Variant was removed!")
print("REASON: Too many alternative alleles reported!")
logger.warn("Processed variant was removed, too many alternative alleles reported!")
continue
else:
ref_length = len(splitted_line[3])
@@ -55,7 +58,7 @@ def split_vcf_file_in_indel_and_snps_set(filepath, filepath_snp, filepath_indel)
splitted_line[7] = splitted_line[7].replace("\n", "") + ";INDEL_ID=" + str(indel_ID)
outfile_indel.write("\t".join(splitted_line))
else:
print("Something was not rigtht!")
logger.critical("Something bad happened!")

vcf_file_to_reformat.close()
outfile_snps.close()
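Outside this hunk, the function routes each record by comparing REF and ALT lengths and tags indels with a running `INDEL_ID` so they can be matched back to their expanded SNP representations after annotation. A hypothetical condensation of that routing (it assumes multi-allelic records were already skipped and that `logger` is the module logger):

```python
def route_variant(splitted_line, indel_id, outfile_snps, outfile_indel):
    # Hypothetical condensation of the routing in the surrounding function.
    ref_length = len(splitted_line[3])
    alt_length = len(splitted_line[4])

    if ref_length == 1 and alt_length == 1:
        outfile_snps.write("\t".join(splitted_line))    # plain SNV
    elif ref_length > 1 or alt_length > 1:
        indel_id += 1                                   # tag for later re-matching
        splitted_line[7] = splitted_line[7].replace("\n", "") + ";INDEL_ID=" + str(indel_id)
        outfile_indel.write("\t".join(splitted_line))
    else:
        logger.critical("Unexpected variant record, could not be classified as SNP or indel!")

    return indel_id
```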