Development (#19)

* Add inheritance information to the result vcf
* Fix typo
* Fix a problem with the multiprocessing
* Fix some problems with multiprocessing in standalone mode. Introduce small changes to the result file creation to prevent possible problems if some data is missing.
imgag · Mar 12, 2021 · 13ca753 · 13ca753
1 parent 20bdf35
commit 13ca753
Show file tree

Hide file tree

Showing 4 changed files with 92 additions and 21 deletions.
diff --git a/aidiva/helper_modules/create_result_vcf.py b/aidiva/helper_modules/create_result_vcf.py
@@ -6,11 +6,11 @@
 def write_header(out_file, single):
     out_file.write("##fileformat=VCFv4.1\n")
     if not single:
-        out_file.write("##INFO=<ID=AIDIVA,Number=4,Type=String,Description=\"AIdiva scores: AIdiva-score,AIdiva-final-score,AIdiva-hpo-relatedness,AIdiva-hpo-relatedness-interacting,AIdiva-filter. (AIdiva-score is the pathogenicity prediction from the random forest; AIdiva-final-score is the predction finalized with the given HPO terms; AIdiva-hpo-relatedness indicates how strong the currrent variant is associated with the given HPO terms; AIdiva-filter 0 or 1 wether all internal filters were passed or not)\">\n")
+        out_file.write("##INFO=<ID=AIDIVA,Number=5,Type=String,Description=\"AIdiva scores: AIdiva-score,AIdiva-final-score,AIdiva-hpo-relatedness,AIdiva-hpo-relatedness-interacting,AIdiva-filter. (AIdiva-score is the pathogenicity prediction from the random forest; AIdiva-final-score is the predction finalized with the given HPO terms; AIdiva-hpo-relatedness indicates how strong the currrent variant is associated with the given HPO terms; AIdiva-filter 0 or 1 wether all internal filters were passed or not)\">\n")
         out_file.write("##INFO=<ID=AIDIVA_INHERITANCE,Number=4,Type=String,Description=\"AIdiva inheritance flags: dominant,denovo,recessive,xlinked,compound. (Each value can be 0 or 1)\">\n")
         out_file.write("##INFO=<ID=AIDIVA_INHERITANCE_COMMENT,Number=1,Type=String,Description=\"AIdiva inheritance flags (dominant,denovo,recessive,xlinked,compound) in written form\">\n")
     else:
-        out_file.write("##INFO=<ID=AIDIVA,Number=4,Type=String,Description=\"AIdiva scores: AIdiva-score,AIdiva-final-score,AIdiva-hpo-relatedness,AIdiva-hpo-relatedness-interacting,AIdiva-filter. (AIdiva-score is the pathogenicity prediction from the random forest; AIdiva-final-score is the predction finalized with the given HPO terms; AIdiva-hpo-relatedness indicates how strong the currrent variant is associated with the given HPO terms; AIdiva-filter 0 or 1 wether all internal filters were passed or not)\">\n")
+        out_file.write("##INFO=<ID=AIDIVA,Number=5,Type=String,Description=\"AIdiva scores: AIdiva-score,AIdiva-final-score,AIdiva-hpo-relatedness,AIdiva-hpo-relatedness-interacting,AIdiva-filter. (AIdiva-score is the pathogenicity prediction from the random forest; AIdiva-final-score is the predction finalized with the given HPO terms; AIdiva-hpo-relatedness indicates how strong the currrent variant is associated with the given HPO terms; AIdiva-filter 0 or 1 wether all internal filters were passed or not)\">\n")
         out_file.write("##INFO=<ID=AIDIVA_INHERITANCE,Number=2,Type=String,Description=\"AIdiva inheritance flags: recessive,compound. (Each value can be 0 or 1)\">\n")
         out_file.write("##INFO=<ID=AIDIVA_INHERITANCE_COMMENT,Number=1,Type=String,Description=\"AIdiva inheritance flags (recessive,compound) in written form\">\n")
 
@@ -43,35 +43,104 @@ def write_header(out_file, single):
 def write_result_vcf(input_data, vcf_file, single):
     input_data = input_data.sort_values(["CHROM", "POS"], ascending=[True, True])
     input_data = input_data.reset_index(drop=True)
+    colnames = input_data.columns
 
     with open(vcf_file, "w") as out:
         write_header(out, single)
 
         for row in input_data.itertuples():
-            if str(row.AIDIVA_SCORE) == "nan":
-                aidiva_score = ""
+            if ("AIDIVA_SCORE" in colnames):
+                if (str(row.AIDIVA_SCORE) == "nan"):
+                    aidiva_score = "."
+                else:
+                    aidiva_score = str(row.AIDIVA_SCORE)
             else:
-                aidiva_score = str(row.AIDIVA_SCORE)
+                aidiva_score = "."
 
-            if str(row.FINAL_AIDIVA_SCORE) == "nan":
-                final_aidiva_score = ""
+            if ("FINAL_AIDIVA_SCORE" in colnames):
+                if (str(row.FINAL_AIDIVA_SCORE) == "nan"):
+                    final_aidiva_score = "."
+                else:
+                    final_aidiva_score = str(row.FINAL_AIDIVA_SCORE)
             else:
-                final_aidiva_score = str(row.FINAL_AIDIVA_SCORE)
+                final_aidiva_score = "."
 
-            if str(row.HPO_RELATEDNESS) == "nan":
-                hpo_relatedness = ""
+            if ("HPO_RELATEDNESS" in colnames):
+                if (str(row.HPO_RELATEDNESS) == "nan"):
+                    hpo_relatedness = "."
+                else:
+                    hpo_relatedness = str(row.HPO_RELATEDNESS)
             else:
-                hpo_relatedness = str(row.HPO_RELATEDNESS)
+                hpo_relatedness = "."
 
-            if str(row.HPO_RELATEDNESS_INTERACTING) == "nan":
-                hpo_relatedness_interacting = ""
+            if ("HPO_RELATEDNESS_INTERACTING" in colnames):
+                if (str(row.HPO_RELATEDNESS_INTERACTING) == "nan"):
+                    hpo_relatedness_interacting = "."
+                else:
+                    hpo_relatedness_interacting = str(row.HPO_RELATEDNESS_INTERACTING)
             else:
-                hpo_relatedness_interacting = str(row.HPO_RELATEDNESS_INTERACTING)
+                hpo_relatedness_interacting = "."
+
+            if ("FILTER_PASSED" in colnames):
+                if (str(row.FILTER_PASSED) == "nan"):
+                    filter_passed = "."
+                else:
+                    filter_passed = str(row.FILTER_PASSED)
+            else:
+                filter_passed = "."
+
+            if ("DOMINANT" in colnames):
+                if (str(row.DOMINANT) == "nan"):
+                    dominant = "."
+                else:
+                    dominant = str(row.DOMINANT)
+            else:
+                dominant = "."
+
+            if ("DOMINANT_DENOVO" in colnames):
+                if (str(row.DOMINANT_DENOVO) == "nan"):
+                    dominant_denovo = "."
+                else:
+                    dominant_denovo = str(row.DOMINANT_DENOVO)
+            else:
+                dominant_denovo = "."
+
+            if ("RECESSIVE" in colnames):
+                if (str(row.RECESSIVE) == "nan"):
+                    recessive = "."
+                else:
+                    recessive = str(row.RECESSIVE)
+            else:
+                recessive = "."
+
+            if ("XLINKED" in colnames):
+                if (str(row.XLINKED) == "nan"):
+                    xlinked = "."
+                else:
+                    xlinked = str(row.XLINKED)
+            else:
+                xlinked = "."
+
+            if ("COMPOUND" in colnames):
+                if (str(row.COMPOUND) == "nan"):
+                    compound = "."
+                else:
+                    compound = str(row.COMPOUND)
+            else:
+                compound = "."
+
+            if ("INHERITANCE" in colnames):
+                if (str(row.INHERITANCE) == "nan") or (str(row.INHERITANCE) == ""):
+                    inheritance_comment = "."
+                else:
+                    inheritance_comment = str(row.INHERITANCE)
+            else:
+                inheritance_comment = "."
 
             if not single:
-                info_entry = "AIDIVA=" + aidiva_score + "," + final_aidiva_score + "," + hpo_relatedness + "," + hpo_relatedness_interacting + "," + str(row.FILTER_PASSED) + ";AIDIVA_INHERITANCE=" + str(row.DOMINANT) + "," + str(row.DOMINANT_DENOVO) + "," + str(row.RECESSIVE) + "," + str(row.XLINKED) + "," + str(row.COMPOUND) + ";AIDIVA_INHERITANCE_COMMENT=" + str(row.INHERITANCE)
+                info_entry = "AIDIVA=" + aidiva_score + "," + final_aidiva_score + "," + hpo_relatedness + "," + hpo_relatedness_interacting + "," + filter_passed + ";AIDIVA_INHERITANCE=" + dominant + "," + dominant_denovo + "," + recessive + "," + xlinked + "," + compound + ";AIDIVA_INHERITANCE_COMMENT=" + inheritance_comment
             else:
-                info_entry = "AIDIVA=" + aidiva_score + "," + final_aidiva_score + "," + hpo_relatedness + "," + hpo_relatedness_interacting + "," + str(row.FILTER_PASSED) + ";AIDIVA_INHERITANCE=" + str(row.RECESSIVE) + "," + str(row.COMPOUND) + ";AIDIVA_INHERITANCE_COMMENT=" + str(row.INHERITANCE)
+                info_entry = "AIDIVA=" + aidiva_score + "," + final_aidiva_score + "," + hpo_relatedness + "," + hpo_relatedness_interacting + "," + filter_passed + ";AIDIVA_INHERITANCE=" + recessive + "," + compound + ";AIDIVA_INHERITANCE_COMMENT=" + inheritance_comment
 
             out.write(str(row.CHROM).strip() + "\t" + str(row.POS) + "\t" + "." + "\t" + str(row.REF) + "\t" + str(row.ALT) + "\t" + "." + "\t" + "." + "\t" + info_entry + "\n")
 

diff --git a/aidiva/run_AIdiva.py b/aidiva/run_AIdiva.py
@@ -107,7 +107,9 @@
         write_result.write_result_vcf(prioritized_data, str(working_directory + output_filename + ".vcf"), bool(family_type == "SINGLE"))
         write_result.write_result_vcf(prioritized_data[prioritized_data["FILTER_PASSED"] == 1], str(working_directory + output_filename + "_filtered.vcf"), bool(family_type == "SINGLE"))
         prioritized_data.to_csv(str(working_directory + output_filename + ".csv"), sep="\t", index=False)
-        prioritized_data[prioritized_data["FILTER_PASSED"] == 1].to_csv(str(working_directory + output_filename + "_passed_filters.csv"), sep="\t", index=False)
+        prioritized_data[prioritized_data["FILTER_PASSED"] == 1].to_csv(str(working_directory + output_filename + "_filtered.csv"), sep="\t", index=False)
         print("Pipeline successfully finsished!")
     else:
+        write_result.write_result_vcf(input_data_snp, str(working_directory + output_filename + ".vcf"), bool(family_type == "SINGLE"))
+        write_result.write_result_vcf(input_data_snp, str(working_directory + output_filename + "_filtered.vcf"), bool(family_type == "SINGLE"))
         print("ERROR: The given input files were empty!")
diff --git a/aidiva/run_annotation_and_AIdiva.py b/aidiva/run_annotation_and_AIdiva.py
@@ -51,7 +51,7 @@
     scoring_model_indel = os.path.dirname(os.path.abspath(__file__)) + "/../data/" + configuration["Analysis-Input"]["scoring-model-indel"]
 
     # obtain number of threads to use during computation
-    num_cores = configuration["VEP-Annotation"]["num-threads"]
+    num_cores = int(configuration["VEP-Annotation"]["num-threads"])
 
     # parse disease and inheritance information
     if args.hpo_list is not None:
@@ -133,11 +133,11 @@
 
     # prioritize and filter variants
     print("Filter variants and finalize score...")
-    prioritized_data = prio.prioritize_variants(predicted_data, hpo_resources_folder, family_file, family_type, hpo_file, gene_exclusion_file, num_cores)
+    prioritized_data = prio.prioritize_variants(predicted_data, hpo_resources_folder, num_cores, family_file, family_type, hpo_file, gene_exclusion_file)
 
     ## TODO: create additional output files according to the inheritance information (only filtered data)
     write_result.write_result_vcf(prioritized_data, str(working_directory + input_filename + "_aidiva_result.vcf"), bool(family_type == "SINGLE"))
     write_result.write_result_vcf(prioritized_data[prioritized_data["FILTER_PASSED"] == 1], str(working_directory + input_filename + "_aidiva_result_filtered.vcf"), bool(family_type == "SINGLE"))
     prioritized_data.to_csv(str(working_directory + input_filename + "_aidiva_result.csv"), sep="\t", index=False)
-    prioritized_data[prioritized_data["FILTER_PASSED"] == 1].to_csv(str(working_directory + input_filename + "_aidiva_result_filt.csv"), sep="\t", index=False)
+    prioritized_data[prioritized_data["FILTER_PASSED"] == 1].to_csv(str(working_directory + input_filename + "_aidiva_result_filtered.csv"), sep="\t", index=False)
     print("Pipeline successfully finsished!")
diff --git a/aidiva/variant_prioritization/prioritize_variants.py b/aidiva/variant_prioritization/prioritize_variants.py
@@ -325,7 +325,7 @@ def add_inheritance_mode(variant, variant_columns):
         if variant["XLINKED"] == 1:
             inheritance_list.append("XLINKED")
 
-    inheritance_mode = ";".join(inheritance_list)
+    inheritance_mode = "&".join(inheritance_list)
 
     return inheritance_mode