Development (#24)
* Add inheritance information to the result vcf

* Fix typo

* Fix a problem with the multiprocessing

* Fix some problems with multiprocessing in standalone mode. Introduce small changes to the result file creation to prevent possible problems.

* Preparations for new feature

* Add new feature

* Prevent too strict filtering

* Fix possible problems when gene symbols are cased differently in different resources.

* Add missing condition to make filtering less strict

* Add todo notes

* Frameshift and splicing (#21)

* Improve filtering and add frameshift handling

* Add splicing predictions and improve filtering

* Refactoring (#22)

* Use 'functools' to get rid of global variables in the context of multiprocessing.

* Update HPO resources to prevent problems. Now gene symbols are in all capital letters in each mapping file

* Update HPO resources generation script to include the gene to interacting genes resource

* Refactoring and removal of unused code fragments

* Bugfix

* Make ABB filter less strict to be more sensitive

* Small changes

* Use logging instead of terminal outputs. Fix some problems with the multisample handling and activate the functionality.

* Remove unnecessary debug messages.

* Remove merge artifact
dboceck authored Jul 27, 2021
1 parent b283301 commit a60186f
Showing 14 changed files with 968 additions and 102 deletions.
@@ -5,6 +5,10 @@
import argparse
from functools import partial
from operator import itemgetter
import logging


logger = logging.getLogger(__name__)


def annotate_indels_with_combined_snps_information(row, grouped_expanded_vcf, feature):
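Each touched helper module now creates a per-module logger via `logging.getLogger(__name__)`. Those loggers only emit output once the application entry point configures logging; a minimal sketch of such a setup (level and format here are assumptions, not part of this commit):

```python
import logging

# Hypothetical one-time setup in the AIdiva run script; the real entry
# point may use a file handler, another level, or another format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
```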
10 changes: 7 additions & 3 deletions aidiva/helper_modules/convert_indels_to_snps_and_create_vcf.py
@@ -3,6 +3,10 @@
import argparse
import random
from Bio import SeqIO
import logging


logger = logging.getLogger(__name__)


def write_data_information_to_file(input_data, outfile, ref_sequence, header):
@@ -36,10 +40,10 @@ def write_data_information_to_file(input_data, outfile, ref_sequence, header):
elif (extended_ref_seq[i] == "G") or (extended_ref_seq[i] == "C"):
alt_variant = random.choice(["A", "T"])
elif (extended_ref_seq[i] == "N"):
print("WARNING: Reference base skipped since it was N!")
logger.warn("Reference base was skipped because it was 'N'!")
continue
else:
print("ERROR: Something went wrong!")
logger.error("The given reference sequence seems to be corrupted!")

outfile.write(str(row.CHROM).strip() + "\t" + str(window_start + i + 1).strip() + "\t" + "." + "\t" + str(extended_ref_seq[i]).strip() + "\t" + str(alt_variant).strip() + "\t" + "." + "\t" + "." + "\t" + str(row.INFO).strip() + "\n")

@@ -62,7 +66,7 @@ def import_vcf_data(in_data):
continue

if header_line == "":
print("ERROR: The VCF file seems to be corrupted")
logger.error("The VCF seems to be corrupted, missing header line!")

# reset file pointer to begin reading at the beginning
input_vcf.close()
148 changes: 100 additions & 48 deletions aidiva/helper_modules/convert_vcf_to_csv.py
@@ -5,6 +5,7 @@
import argparse
from functools import partial
from operator import itemgetter
import logging


variant_consequences = {"transcript_ablation": 1,
@@ -44,6 +45,8 @@
"feature_truncation": 35,
"intergenic_variant": 36}

logger = logging.getLogger(__name__)


def reformat_vcf_file_and_read_into_pandas_and_extract_header(filepath):
header_line = ""
@@ -68,7 +71,7 @@ def reformat_vcf_file_and_read_into_pandas_and_extract_header(filepath):
continue

if header_line == "":
print("ERROR: The VCF file seems to be corrupted")
logger.warn("The VCF file seems to be corrupted")

vcf_header = header_line.strip().split("\t")
vcf_as_dataframe = pd.read_csv(tmp.name, names=vcf_header, sep="\t", comment="#", low_memory=False)
@@ -88,6 +91,18 @@ def extract_annotation_header(header):
return annotation_header


def extract_sample_header(header):
sample_header = []

for line in header:
if line.startswith("##SAMPLE=<"):
sample_entry = line.strip().replace("##SAMPLE=<", "").replace(">", "")
sample_id = sample_entry.split(",")[0]
sample_header.append(sample_id.split("=")[1])

return sample_header
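The new `extract_sample_header` helper collects the sample IDs from the `##SAMPLE=<...>` meta lines so that multi-sample columns can be decoded later. A small illustration with invented header lines (every attribute besides `ID` is an assumption):

```python
# Invented VCF meta lines; only the leading "ID=..." attribute is used.
header = [
    "##fileformat=VCFv4.2",
    "##SAMPLE=<ID=index,DiseaseStatus=affected>",
    "##SAMPLE=<ID=mother,DiseaseStatus=unaffected>",
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
]

print(extract_sample_header(header))  # -> ['index', 'mother']
```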


def extract_columns(cell, process_indel):
info_fields = str(cell).split(";")

@@ -156,53 +171,78 @@ def extract_vep_annotation(cell, annotation_header):
return new_cols


def extract_sample_information(row, sample):
sample_header = str(row["FORMAT"]).strip().split(":")
sample_fields = str(row[sample + ".full"]).strip().split(":")

if len(sample_header) != len(sample_fields):
num_missing_entries = abs(len(sample_header) - len(sample_fields))
for i in range(num_missing_entries):
sample_fields.append(".")
def extract_sample_information(row, sample, sample_header=None):
if str(row["FORMAT"]) == "MULTI":
if sample_header is not None:
sample_dict = {}
sample_information = []
for sample_entry in row[sample].split(","):
splitted_entry = sample_entry.split("=")
sample_dict[splitted_entry[0]] = splitted_entry[1]

for sample_id in sample_header:
splitted_sample_information = sample_dict[sample_id].split("|")
if splitted_sample_information[0] == "wt":
sample_information.append("0/0")
elif splitted_sample_information[0] == "hom":
sample_information.append("1/1")
elif splitted_sample_information[0] == "het":
sample_information.append("0/1")
else:
logger.warning("Genotype not recognized! (%s)" % (splitted_sample_information[0]))

sample_information.append(splitted_sample_information[1])
sample_information.append(splitted_sample_information[2])
sample_information.append(sample_id + "=" + sample_dict[sample_id])
else:
logger.error("Format is MULTI but no sample_header is given!")
else:
format_entries = str(row["FORMAT"]).strip().split(":")
sample_fields = str(row[sample + ".full"]).strip().split(":")

if len(sample_header) != len(sample_fields):
num_missing_entries = abs(len(sample_header) - len(sample_fields))
for i in range(num_missing_entries):
sample_fields.append(".")
if len(format_entries) != len(sample_fields):
num_missing_entries = abs(len(format_entries) - len(sample_fields))
for i in range(num_missing_entries):
sample_fields.append(".")

if "GT" in sample_header:
sample_gt_information = sample_fields[sample_header.index("GT")]
else:
sample_gt_information = "./."
if len(format_entries) != len(sample_fields):
num_missing_entries = abs(len(format_entries) - len(sample_fields))
for i in range(num_missing_entries):
sample_fields.append(".")

if "DP" in sample_header:
sample_dp_information = sample_fields[sample_header.index("DP")]
else:
sample_dp_information = "."
if "GT" in format_entries:
sample_gt_information = sample_fields[format_entries.index("GT")]
else:
sample_gt_information = "./."

if "AD" in sample_header:
sample_ref_information = sample_fields[sample_header.index("AD")].split(",")[0]
sample_alt_information = sample_fields[sample_header.index("AD")].split(",")[1]
else:
sample_ref_information = "."
sample_alt_information = "."
if "DP" in format_entries:
sample_dp_information = sample_fields[format_entries.index("DP")]
else:
sample_dp_information = "."

if "GQ" in sample_header:
sample_gq_information = sample_fields[sample_header.index("GQ")]
else:
sample_gq_information = "."
if "AD" in format_entries:
sample_ref_information = sample_fields[format_entries.index("AD")].split(",")[0]
sample_alt_information = sample_fields[format_entries.index("AD")].split(",")[1]
else:
sample_ref_information = "."
sample_alt_information = "."

if (sample_ref_information != ".") and (sample_alt_information != "."):
divisor = (int(sample_ref_information) + int(sample_alt_information))
if divisor == 0:
sample_af_information = 0
if "GQ" in format_entries:
sample_gq_information = sample_fields[format_entries.index("GQ")]
else:
sample_af_information = (int(sample_alt_information) / divisor)
sample_gq_information = "."

else:
sample_af_information = "."
if (sample_ref_information != ".") and (sample_alt_information != "."):
divisor = (int(sample_ref_information) + int(sample_alt_information))
if divisor == 0:
sample_af_information = 0
else:
sample_af_information = (int(sample_alt_information) / divisor)

else:
sample_af_information = "."

sample_information = [sample_gt_information, sample_dp_information, sample_ref_information, sample_alt_information, sample_af_information, sample_gq_information]
sample_information = [sample_gt_information, sample_dp_information, sample_ref_information, sample_alt_information, sample_af_information, sample_gq_information]

return sample_information
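For multi-sample (e.g. trio) calls the `FORMAT` column is set to `MULTI` and a single cell packs all samples. A hedged walk-through with an invented cell value (the `genotype|value|value` layout is inferred from the split logic above; judging by the dataframe columns added below, the two numeric fields end up as `DP` and `ALT`):

```python
# Invented MULTI cell for illustration; real caller output may differ.
multi_cell = "index=het|25|12,mother=wt|30|0,father=hom|28|27"

genotype_map = {"wt": "0/0", "het": "0/1", "hom": "1/1"}

for entry in multi_cell.split(","):
    sample_id, packed = entry.split("=")
    fields = packed.split("|")
    print(sample_id, genotype_map[fields[0]], fields[1], fields[2])
# index 0/1 25 12
# mother 0/0 30 0
# father 1/1 28 27
```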

@@ -225,12 +265,21 @@ def add_VEP_annotation_to_dataframe(annotation_header, vcf_as_dataframe):
return vcf_as_dataframe


def add_sample_information_to_dataframe(sample_ids, vcf_as_dataframe):
def add_sample_information_to_dataframe(sample_ids, sample_header, vcf_as_dataframe):
for sample in sample_ids:
vcf_as_dataframe.rename(columns={sample: sample + ".full"}, inplace=True)
sample_header = ["GT." + sample, "DP." + sample, "REF." + sample, "ALT." + sample, "AF." + sample, "GQ." + sample]
vcf_as_dataframe[sample_header] = vcf_as_dataframe.apply(lambda x: pd.Series(extract_sample_information(x, sample)), axis=1)
vcf_as_dataframe = vcf_as_dataframe.drop(columns=[sample + ".full"])
if (sample == "trio") or (sample == "multi"):
sample_header_multi = []
for sample_id in sample_header:
sample_header_multi.append("GT." + sample_id)
sample_header_multi.append("DP." + sample_id)
sample_header_multi.append("ALT." + sample_id)
sample_header_multi.append(sample_id + ".full")
vcf_as_dataframe[sample_header_multi] = vcf_as_dataframe.apply(lambda x: pd.Series(extract_sample_information(x, sample, sample_header)), axis=1)
else:
vcf_as_dataframe.rename(columns={sample: sample + ".full"}, inplace=True)
sample_header_default = ["GT." + sample, "DP." + sample, "REF." + sample, "ALT." + sample, "AF." + sample, "GQ." + sample]
vcf_as_dataframe[sample_header_default] = vcf_as_dataframe.apply(lambda x: pd.Series(extract_sample_information(x, sample)), axis=1)
vcf_as_dataframe = vcf_as_dataframe.drop(columns=[sample + ".full"])

vcf_as_dataframe = vcf_as_dataframe.drop(columns=["FORMAT"])
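`add_sample_information_to_dataframe` leans on the pandas idiom of returning a `pd.Series` from `apply` to fan one packed column out into several new columns in a single pass. A minimal, self-contained sketch of that pattern:

```python
import pandas as pd

df = pd.DataFrame({"packed": ["0/1:25", "1/1:30"]})

# Returning a Series from apply assigns one value per target column.
df[["GT", "DP"]] = df.apply(lambda row: pd.Series(row["packed"].split(":")), axis=1)

print(df)
#    packed   GT  DP
# 0  0/1:25  0/1  25
# 1  1/1:30  1/1  30
```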

@@ -240,31 +289,34 @@ def add_sample_information_to_dataframe(sample_ids, vcf_as_dataframe):
def convert_vcf_to_pandas_dataframe(input_file, process_indel, num_cores):
header, vcf_as_dataframe = reformat_vcf_file_and_read_into_pandas_and_extract_header(input_file)

logger.info("Convert")

sample_ids = []
# FORMAT column has index 8 (counted from 0) and sample columns follow afterwards (sample names are unique)
## TODO: add condition to check if FORMAT column exists
for i in range(9, len(vcf_as_dataframe.columns)):
sample_ids.append(vcf_as_dataframe.columns[i])

annotation_header = extract_annotation_header(header)
sample_header = extract_sample_header(header)

if not vcf_as_dataframe.empty:
vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_INFO_fields_to_dataframe, process_indel), num_cores)
vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_VEP_annotation_to_dataframe, annotation_header), num_cores)

if len(vcf_as_dataframe.columns) > 8:
if "FORMAT" in vcf_as_dataframe.columns:
vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_sample_information_to_dataframe, sample_ids), num_cores)
vcf_as_dataframe = parallelize_dataframe_processing(vcf_as_dataframe, partial(add_sample_information_to_dataframe, sample_ids, sample_header), num_cores)
else:
# This warning is always triggered when the expanded indel VCF file is processed; if it appears only once in that situation, it can be ignored.
print("WARNING: It seems that your VCF file does contain sample information but not FORMAT description!")
logger.warn("It seems that your VCF file does contain sample information but no FORMAT description!")
else:
print("MISSING SAMPLE INFORMATION!")
logger.error("MISSING SAMPLE INFORMATION!")

# replace empty strings or only spaces with NaN
vcf_as_dataframe = vcf_as_dataframe.replace(r"^\s*$", np.nan, regex=True)
else:
print("WARNING: The given VCF file is empty!")
logger.error("The given VCF file is empty!")

return vcf_as_dataframe
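The call sites above pre-bind the constant arguments with `functools.partial`, so each worker function only receives a dataframe chunk; this is what lets the refactoring drop the module-level globals that the old multiprocessing setup relied on. `parallelize_dataframe_processing` itself is not shown in this diff; the following is a plausible sketch, offered as an assumption based on its call signature:

```python
import multiprocessing as mp

import numpy as np
import pandas as pd

def parallelize_dataframe_processing(vcf_as_dataframe, function, num_cores):
    # Assumed implementation: split the frame, map the pre-bound
    # worker over a pool, and reassemble the processed chunks.
    chunks = np.array_split(vcf_as_dataframe, num_cores)
    with mp.Pool(processes=num_cores) as pool:
        processed_chunks = pool.map(function, chunks)
    return pd.concat(processed_chunks)
```

Because `partial(add_INFO_fields_to_dataframe, process_indel)` fixes `process_indel` up front, the pool only ever has to pass the chunk argument to each task.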

4 changes: 4 additions & 0 deletions aidiva/helper_modules/create_result_vcf.py
@@ -1,6 +1,10 @@
import pandas as pd
import numpy as np
import argparse
import logging


logger = logging.getLogger(__name__)


def write_header(out_file, single):
9 changes: 8 additions & 1 deletion aidiva/helper_modules/filter_vcf.py
@@ -1,6 +1,7 @@
import argparse
import gzip
import tempfile
import logging


coding_variants = ["splice_acceptor_variant",
@@ -22,6 +23,8 @@
"5_prime_UTR_variant",
"3_prime_UTR_variant"]

logger = logging.getLogger(__name__)


def filter_coding_variants(filepath, filepath_out, annotation_field_name):
if filepath.endswith(".gz"):
@@ -62,6 +65,9 @@ def filter_coding_variants(filepath, filepath_out, annotation_field_name):
elif line.strip().startswith("##FILTER="):
outfile.write(line)
continue
elif line.strip().startswith("##ANALYSISTYPE="):
outfile.write(line)
continue
elif line.strip().startswith("##SAMPLE="):
outfile.write(line)
continue
@@ -102,7 +108,8 @@ def filter_coding_variants(filepath, filepath_out, annotation_field_name):
splitted_line[7] = "."
outfile.write("\t".join(splitted_line))
else:
print("WARNING: Annotation field missing!")
logger.error("Annotation field missing!")
logger.warn("Variant filtering will be skipped!")

vcf_file_to_reformat.close()
outfile.close()
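`filter_coding_variants` transparently accepts plain and gzipped VCFs via the `.gz` suffix check at the top of the function. The same pattern, extracted into a generic helper as a sketch:

```python
import gzip

def open_vcf(filepath):
    # gzip.open in text mode ("rt") for compressed files, plain open otherwise.
    if filepath.endswith(".gz"):
        return gzip.open(filepath, "rt")
    return open(filepath, "r")
```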
2 changes: 1 addition & 1 deletion aidiva/helper_modules/generate_HPO_resources.py
@@ -91,7 +91,7 @@ def extract_hpo_graph_edges(hpo_ontology, hpo_edges_file):
# awk -F '\t' '{print $5}' < phenotype_annotation.tab | sort | uniq -c | awk '{print $2 "\t" $1}' > HPO_counts.txt
def generate_hpo_graph(hpo_counts, hpo_edges_file, hpo_graph_file):
print("Generate HPO graph...")
offset =1000 # penalization for the distance
offset = 1000 # penalization for the distance
# idea: build a graph in which each node carries its calculated IC value as an attribute
output = hpo_graph_file # contains the nodes and edges to import in a DiGraph
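The graph generation attaches an information-content (IC) value to each HPO node, computed from the term counts produced by the awk one-liner above. IC is conventionally the negative log of a term's annotation frequency; a sketch under that convention (the function name and exact file handling are assumptions):

```python
import math

def compute_ic_values(hpo_counts_file):
    # Assumed two-column layout "HP:XXXXXXX<TAB>count", matching the awk output above.
    counts = {}
    with open(hpo_counts_file) as infile:
        for line in infile:
            term, count = line.strip().split("\t")
            counts[term] = int(count)

    total = sum(counts.values())
    # IC(term) = -log(p(term)): the rarer a term, the more informative it is.
    return {term: -math.log(count / total) for term, count in counts.items()}
```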

9 changes: 6 additions & 3 deletions aidiva/helper_modules/split_vcf_in_indel_and_snp_set.py
@@ -3,6 +3,10 @@
import tempfile
import argparse
import gzip
import logging


logger = logging.getLogger(__name__)


def split_vcf_file_in_indel_and_snps_set(filepath, filepath_snp, filepath_indel):
@@ -38,8 +42,7 @@ def split_vcf_file_in_indel_and_snps_set(filepath, filepath_snp, filepath_indel)
# remove variants with multiple alternative alleles reported
# TODO decide how to handle them in general
if "," in splitted_line[4]:
print("Variant was removed!")
print("REASON: Too many alternative alleles reported!")
logger.warn("Processed variant was removed, too many alternative alleles reported!")
continue
else:
ref_length = len(splitted_line[3])
@@ -55,7 +58,7 @@ def split_vcf_file_in_indel_and_snps_set(filepath, filepath_snp, filepath_indel)
splitted_line[7] = splitted_line[7].replace("\n", "") + ";INDEL_ID=" + str(indel_ID)
outfile_indel.write("\t".join(splitted_line))
else:
print("Something was not rigtht!")
logger.critical("Something bad happened!")

vcf_file_to_reformat.close()
outfile_snps.close()
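Outside this hunk, the function routes each record by comparing REF and ALT lengths and tags indels with a running `INDEL_ID` so they can be matched back to their expanded SNP representations after annotation. A hypothetical condensation of that routing (it assumes multi-allelic records were already skipped and that `logger` is the module logger):

```python
def route_variant(splitted_line, indel_id, outfile_snps, outfile_indel):
    # Hypothetical condensation of the routing in the surrounding function.
    ref_length = len(splitted_line[3])
    alt_length = len(splitted_line[4])

    if ref_length == 1 and alt_length == 1:
        outfile_snps.write("\t".join(splitted_line))    # plain SNV
    elif ref_length > 1 or alt_length > 1:
        indel_id += 1                                   # tag for later re-matching
        splitted_line[7] = splitted_line[7].replace("\n", "") + ";INDEL_ID=" + str(indel_id)
        outfile_indel.write("\t".join(splitted_line))
    else:
        logger.critical("Unexpected variant record, could not be classified as SNP or indel!")

    return indel_id
```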