Merge pull request #17 from imgag/development

Development
imgag · Feb 26, 2021 · e49bc46 · e49bc46
2 parents 58bf624 + e85655b
commit e49bc46
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 55 deletions.
diff --git a/aidiva/run_annotation_and_AIdiva.py b/aidiva/run_annotation_and_AIdiva.py
@@ -27,7 +27,7 @@
     parser.add_argument("--hpo_list", type=str, dest="hpo_list", metavar="hpo.txt", required=False, help="TXT file containing the HPO terms reported for the current patient")
     parser.add_argument("--gene_exclusion", type=str, dest="gene_exclusion", metavar="gene_exclusion.txt", required=False, help="TXT file containing the genes to exclude in the analysis")
     parser.add_argument("--family_file", type=str, dest="family_file", metavar="family.txt", required=False, help="TXT file showing the family relation of the current patient")
-    parser.add_argument("--family_type", type=str, dest="family_type", metavar="SINGLE", required=False, help="String indicating the present family type [SINGLE, TRIO, FAMILY]")
+    parser.add_argument("--family_type", type=str, dest="family_type", metavar="SINGLE", required=False, help="String indicating the present family type [SINGLE, TRIO]")
     #parser.add_argument("--threads", type=int, dest="threads", metavar="1", required=False, help="Number of threads to use. (default: 1)")
     args = parser.parse_args()
 
@@ -43,7 +43,7 @@
 
     data_path = "/mnt/data/"
     # data_path = os.path.dirname(os.path.abspath(__file__)) + "/../data/"
-    ref_path = data_path + configuration["Analysis-Input"]["ref-path"]
+    ref_path = configuration["Analysis-Input"]["ref-path"]
 
     # parse input files
     input_vcf = args.vcf
@@ -91,8 +91,9 @@
     hpo_resources_folder = os.path.dirname(os.path.abspath(__file__)) + "/../data/hpo_resources/"
 
     print("Starting VCF preparation...")
-    # filtering step to remove unsupported variants
-    annotate.annotate_consequence_information(input_vcf, str(working_directory + input_filename + "_consequence.vcf"), vep_annotation_dict, num_cores)
+    # sorting and filtering step to remove unsupported variants
+    annotate.sort_vcf(input_vcf, str(working_directory + input_filename + "_sorted.vcf"), vep_annotation_dict)
+    annotate.annotate_consequence_information(str(working_directory + input_filename + "_sorted.vcf"), str(working_directory + input_filename + "_consequence.vcf"), vep_annotation_dict, num_cores)
     filt_vcf.filter_coding_variants(str(working_directory + input_filename + "_consequence.vcf"), str(working_directory + input_filename + "_filtered.vcf"), "CONS")
 
     # convert input vcf to pandas dataframe
@@ -108,8 +109,8 @@
     # Additional annotation with AnnotateFromVCF (a ngs-bits tool)
     # If VCF is used as output format VEP won't annotate from custom VCF files
     print("Starting AnnotateFromVCF annotation...")
-    annotate.annotate_from_vcf(str(working_directory + input_filename + "_snp_vep.vcf"), str(working_directory + input_filename + "_snp_vep_annotated.vcf"), num_cores)
-    annotate.annotate_from_vcf(str(working_directory + input_filename + "_indel_expanded_vep.vcf"), str(working_directory + input_filename + "_indel_expanded_vep_annotated.vcf"), num_cores)
+    annotate.annotate_from_vcf(str(working_directory + input_filename + "_snp_vep.vcf"), str(working_directory + input_filename + "_snp_vep_annotated.vcf"), vep_annotation_dict, num_cores)
+    annotate.annotate_from_vcf(str(working_directory + input_filename + "_indel_expanded_vep.vcf"), str(working_directory + input_filename + "_indel_expanded_vep_annotated.vcf"), vep_annotation_dict, num_cores)
 
     # convert annotated vcfs back to pandas dataframes
     input_data_snp_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(working_directory + input_filename + "_snp_vep_annotated.vcf"), False, num_cores)

diff --git a/aidiva/variant_annotation/annotate_with_vep.py b/aidiva/variant_annotation/annotate_with_vep.py
@@ -7,28 +7,23 @@
 
 def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_dict, basic=False, expanded=False, num_cores=1):
     # the path to the executable
-    database_path = "/mnt/data/dbs/"
-    # database_path = os.path.dirname(__file__) + "/../../annotation_resources/
-
-    tool_path = "/mnt/data/tools/"
-    # tool_path = os.path.dirname(__file__) + "/../../tools/"
-    vep_command = tool_path + vep_annotation_dict["vep"] + " "
+    vep_command = vep_annotation_dict["vep"] + "/" + "vep "
 
     bed_annotation = vep_annotation_dict["custom"]["bed-files"]
     bigwig_annotation = vep_annotation_dict["custom"]["bigwig-files"]
     #vcf_annotation = vep_annotation_dict["custom"]["vcf-files"]
 
     # set the correct paths to the needed perl modules
     if "PERL5LIB" in os.environ:
-        os.environ["PERL5LIB"] = tool_path + "/ensembl-vep-release-100.3/Bio/:" + tool_path + "/ensembl-vep-release-100.3/cpan/lib/perl5/:" + os.environ["PERL5LIB"]
+        os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/:" + os.environ["PERL5LIB"]
     else:
-        os.environ["PERL5LIB"] = tool_path + "/ensembl-vep-release-100.3/Bio/:" + tool_path + "/ensembl-vep-release-100.3/cpan/lib/perl5/"
+        os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/"
 
     # add essential parameters
     vep_command = vep_command + "--species homo_sapiens --assembly GRCh37" + " "
     vep_command = vep_command + "--offline" + " "
     vep_command = vep_command + "--cache" + " "
-    vep_command = vep_command + "--dir_cache " + database_path + vep_annotation_dict["cache-path"] + " "
+    vep_command = vep_command + "--dir_cache " + vep_annotation_dict["db"] + vep_annotation_dict["cache-path"] + " "
     vep_command = vep_command + "--gencode_basic" + " "
     vep_command = vep_command + "--symbol" + " "
     vep_command = vep_command + "--biotype" + " "
@@ -42,22 +37,22 @@ def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_di
         #vep_command = vep_command + "--af_1kg" + " "
         #vep_command = vep_command + "--af_esp" + " "
         #vep_command = vep_command + "--af_gnomad" + " "
-        vep_command = vep_command + "--custom " + database_path + bed_annotation["simpleRepeat"]["file"]  + "," + "simpleRepeat" + ",bed," + bed_annotation["simpleRepeat"]["method"] + ",0" + " "
-        vep_command = vep_command + "--custom " + database_path + bed_annotation["oe_lof"]["file"]  + "," + "oe_lof" + ",bed," + bed_annotation["oe_lof"]["method"] + ",0" + " "
+        vep_command = vep_command + "--custom " + vep_annotation_dict["db"] + bed_annotation["simpleRepeat"]["file"]  + "," + "simpleRepeat" + ",bed," + bed_annotation["simpleRepeat"]["method"] + ",0" + " "
+        vep_command = vep_command + "--custom " + vep_annotation_dict["db"] + bed_annotation["oe_lof"]["file"]  + "," + "oe_lof" + ",bed," + bed_annotation["oe_lof"]["method"] + ",0" + " "
 
     # vep plugins to use
     if not basic:
         vep_command = vep_command + "--sift s" + " "
         vep_command = vep_command + "--polyphen s" + " "
         #vep_command = vep_command + "--plugin Condel," + vep_annotation_dict["condel"] + ",s" + " "
-        vep_command = vep_command + "--plugin CADD," + database_path + vep_annotation_dict["cadd-snps"] + "," + database_path + vep_annotation_dict["cadd-indel"] + " "
-        vep_command = vep_command + "--plugin REVEL," + database_path + vep_annotation_dict["revel"] + " "
+        vep_command = vep_command + "--plugin CADD," + vep_annotation_dict["db"] + vep_annotation_dict["cadd-snps"] + " " #"," + vep_annotation_dict["db"] + vep_annotation_dict["cadd-indel"] + " "
+        vep_command = vep_command + "--plugin REVEL," + vep_annotation_dict["db"] + vep_annotation_dict["revel"] + " "
         #vep_command = vep_command + "--plugin dbNSFP," + vep_annotation_dict["dbNSFP"] + ",MutationAssessor_score,Eigen-raw,Eigen-phred" + " "
 
         if bed_annotation:
             for key in bed_annotation:
                 if (key != "simpleRepeat") and (key != "oe_lof"):
-                    vep_command = vep_command + "--custom " + database_path + bed_annotation[key]["file"]  + "," + key + ",bed," + bed_annotation[key]["method"] + ",0" + " "
+                    vep_command = vep_command + "--custom " + vep_annotation_dict["db"] + bed_annotation[key]["file"]  + "," + key + ",bed," + bed_annotation[key]["method"] + ",0" + " "
 
         # does not work (seems to be a problem with VEP)
         #if vcf_annotation:
@@ -66,11 +61,11 @@ def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_di
 
         if bigwig_annotation:
             for key in bigwig_annotation:
-                vep_command = vep_command + "--custom " + database_path + bigwig_annotation[key]["file"] + "," + key + ",bigwig," + bigwig_annotation[key]["method"] + ",0" + " "
+                vep_command = vep_command + "--custom " + vep_annotation_dict["db"] + bigwig_annotation[key]["file"] + "," + key + ",bigwig," + bigwig_annotation[key]["method"] + ",0" + " "
 
     vep_command = vep_command + "-i " + input_vcf_file + " "
     vep_command = vep_command + "-o " + output_vcf_file + " "
-    vep_command = vep_command + "--fork " + str(vep_annotation_dict["num-threads"]) + " "
+    vep_command = vep_command + "--fork " + str(num_cores) + " "
     vep_command = vep_command + "--vcf" + " "
     vep_command = vep_command + "--no_stats" + " "
     vep_command = vep_command + "--force_overwrite"
@@ -81,32 +76,27 @@ def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_di
 
 def annotate_consequence_information(input_vcf_file, output_vcf_file, vep_annotation_dict, num_cores=1):
 	# the path to the executable
-    database_path = "/mnt/data/dbs/"
-    # database_path = os.path.dirname(__file__) + "/../../annotation_resources/
-
-    tool_path = "/mnt/data/tools/"
-    # tool_path = os.path.dirname(__file__) + "/../../tools/"
-    vep_command = tool_path + vep_annotation_dict["vep"] + " "
+    vep_command = vep_annotation_dict["vep"] + "/" + "vep "
 
     # set the correct paths to the needed perl modules
     if "PERL5LIB" in os.environ:
-        os.environ["PERL5LIB"] = tool_path + "/ensembl-vep-release-100.3/Bio/:" + tool_path + "/ensembl-vep-release-100.3/cpan/lib/perl5/:" + os.environ["PERL5LIB"]
+        os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/:" + os.environ["PERL5LIB"]
     else:
-        os.environ["PERL5LIB"] = tool_path + "/ensembl-vep-release-100.3/Bio/:" + tool_path + "/ensembl-vep-release-100.3/cpan/lib/perl5/"
+        os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/"
 
     # add essential parameters
     vep_command = vep_command + "--species homo_sapiens --assembly GRCh37" + " "
     vep_command = vep_command + "--offline" + " "
     vep_command = vep_command + "--cache" + " "
-    vep_command = vep_command + "--dir_cache " + database_path + vep_annotation_dict["cache-path"] + " "
+    vep_command = vep_command + "--dir_cache " + vep_annotation_dict["db"] + vep_annotation_dict["cache-path"] + " "
     vep_command = vep_command + "--gencode_basic" + " "
 
 
     vep_command = vep_command + "-i " + input_vcf_file + " "
     vep_command = vep_command + "-o " + output_vcf_file + " "
     vep_command = vep_command + "--fields Consequence "
     vep_command = vep_command + "--vcf_info_field CONS "
-    vep_command = vep_command + "--fork " + str(vep_annotation_dict["num-threads"]) + " "
+    vep_command = vep_command + "--fork " + str(num_cores) + " "
     vep_command = vep_command + "--vcf" + " "
     vep_command = vep_command + "--no_stats" + " "
     vep_command = vep_command + "--force_overwrite"
@@ -115,30 +105,30 @@ def annotate_consequence_information(input_vcf_file, output_vcf_file, vep_annota
     print("The annotated VCF is saved as %s" % (output_vcf_file))
 
 
-def annotate_from_vcf(input_vcf_file, output_vcf_file, num_cores):
+def annotate_from_vcf(input_vcf_file, output_vcf_file, vep_annotation_dict, num_cores):
     tmp = tempfile.NamedTemporaryFile(mode="w+b", suffix=".config", delete=False)
 
-    database_path = "/mnt/data/dbs/"
-    # database_path = os.path.dirname(__file__) + "/../../annotation_resources/
-    #tool_path = "/mnt/storage1/share/opt/"
-    tool_path = "/mnt/data/tools/"
-    # tool_path = os.path.dirname(__file__) + "/../../tools/"
-
-    command = tool_path + "ngs-bits/bin/VcfAnnotateFromVcf -config_file " + tmp.name + " -in " + input_vcf_file + " -out " + output_vcf_file + " -threads " + str(num_cores)
+    vcf_annotation = vep_annotation_dict["custom"]["vcf-files"]
+    command = vep_annotation_dict["ngs-bits"] + "/" + "VcfAnnotateFromVcf -config_file " + tmp.name + " -in " + input_vcf_file + " -out " + output_vcf_file + " -threads " + str(num_cores)
 
     try:
-        tmp.write(str(database_path + "Condel/hg19_precomputed_Condel.vcf.gz\t\tCONDEL\t\ttrue\n").encode())
-        tmp.write(str(database_path + "Eigen/hg19_Eigen-phred_coding_chrom1-22.vcf.gz\t\tEIGEN_PHRED\t\ttrue\n").encode())
-        tmp.write(str(database_path + "fathmm-XF/hg19_fathmm_xf_coding.vcf.gz\t\tFATHMM_XF\t\ttrue\n").encode())
-        tmp.write(str(database_path + "MutationAssessor/hg19_precomputed_MutationAssessor.vcf.gz\t\tMutationAssessor\t\ttrue\n").encode())
-        tmp.write(str(database_path + "gnomAD/gnomAD_genome_r2.1.1.vcf.gz\tgnomAD\tAN,Hom\t\ttrue\n").encode())
+        tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["CONDEL"]["file"] + "\t\tCONDEL\t\ttrue\n").encode())
+        tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["EIGEN_PHRED"]["file"] + "\t\tEIGEN_PHRED\t\ttrue\n").encode())
+        tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["FATHMM_XF"]["file"] + "\t\tFATHMM_XF\t\ttrue\n").encode())
+        tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["MutationAssessor"]["file"] + "\t\tMutationAssessor\t\ttrue\n").encode())
+        tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["gnomAD"]["file"] + "\tgnomAD\tAN,Hom\t\ttrue\n").encode())
         tmp.close()
 
         subprocess.run(command, shell=True, check=True)
     finally:
         os.remove(tmp.name)
 
 
+def sort_vcf(input_vcf_file, output_vcf_file, vep_annotation_dict):
+    command = vep_annotation_dict["ngs-bits"] + "/" + "VcfSort -in " + input_vcf_file + " -out " + output_vcf_file
+    subprocess.run(command, shell=True, check=True)
+
+
 if __name__=="__main__":
     parser = argparse.ArgumentParser(description = "AIdiva -- Annotation with VEP")
     parser.add_argument("--in_data", type=str, dest="in_data", metavar="data.vcf", required=True, help="VCF file containing the data, you want to annotate with VEP\n")

diff --git a/aidiva/variant_prioritization/prioritize_variants.py b/aidiva/variant_prioritization/prioritize_variants.py
@@ -170,9 +170,9 @@ def parallelize_dataframe_processing(variant_data, function, n_cores):
 
 
 def parallelized_variant_processing(variant_data):
+    variant_data = check_inheritance(variant_data, family_type, family)
     variant_data[["HPO_RELATEDNESS", "HPO_RELATEDNESS_INTERACTING", "FINAL_AIDIVA_SCORE"]] = variant_data.apply(lambda variant: pd.Series(compute_hpo_relatedness_and_final_score(variant)), axis=1)
     variant_data[["FILTER_PASSED", "FILTER_COMMENT"]] = variant_data.apply(lambda variant: pd.Series(check_filters(variant)), axis=1)
-    variant_data = check_inheritance(variant_data, family_type, family)
 
     return variant_data
 
@@ -224,11 +224,13 @@ def compute_hpo_relatedness_and_final_score(variant):
                     gene_distances_interacting.append(g_dist)
 
                 if gene_distances or gene_distances_interacting:
-                    hpo_relatedness = max(gene_distances, default=0.0)
-                    hpo_relatedness_interacting = max(gene_distances_interacting, default=0.0)
+                    # take only the maximum HPO relatedness to prevent downvoting of genes if no HPO relation in interacting genes is observed
+                    hpo_relatedness = max(max(gene_distances, default=0.0), max(gene_distances_interacting, default=0.0))
+                    #hpo_relatedness_interacting = max(gene_distances_interacting, default=0.0)
                     ## TODO: try different weighting of AIDIVA_SCORE and HPO_RELATEDNESS and HPO_RELATEDNESS_INTERACTING (eg 0.6 and 0.3 and 0.1)
-                    #final_score = (float(variant["AIDIVA_SCORE"]) * 0.6 + float(hpo_relatedness) * 0.3 + float(hpo_relatedness_interacting) * 0.1)# / 3
-                    final_score = (float(variant["AIDIVA_SCORE"]) + float(hpo_relatedness) + float(hpo_relatedness_interacting)) / 3
+                    # predicted pathogenicity has a higher weight than the HPO relatedness
+                    final_score = (float(variant["AIDIVA_SCORE"]) * 0.67 + float(hpo_relatedness) * 0.33) #+ float(hpo_relatedness_interacting) * 0.1)# / 3
+                    #final_score = (float(variant["AIDIVA_SCORE"]) + float(hpo_relatedness) + float(hpo_relatedness_interacting)) / 3
             else:
                 final_score = np.nan
                 hpo_relatedness = np.nan

diff --git a/data/AIdiva_configuration_with_annotation.yaml b/data/AIdiva_configuration_with_annotation.yaml
@@ -13,7 +13,7 @@ Analysis-Input:
     #work-dir: test_workdir2/
 
     # path to the reference assembly used during the expansion of the indels
-    ref-path: genomes/GRCh37.fa
+    ref-path: /mnt/share/data/genomes/GRCh37.fa
 
     # trained scoring models used to predict the pathogenicity score
     # if no trained model is present you can use the train_model.py script to train a new custom model
@@ -87,13 +87,16 @@ VEP-Annotation:
     perform-vep-annotation: True
 
     # VEP install path
-    vep: ensembl-vep-release-100.3/vep
+    vep: /mnt/share/opt/ensembl-vep-release-100.3/
 
-    # path to the annotation databases
-    db: dbs/
+    # ngs-bits install path
+    ngs-bits: /mnt/share/opt/ngs-bits-current/
+
+    # path to the annotation databases (the databases itself are given as relative path in this directory)
+    db: /mnt/share/data/dbs/
 
     # threads to use during the annotation
-    num-threads: 8
+    num-threads: 10
 
     # Cache directory and plugin directory
     cache-path: ensembl-vep-100/cache
@@ -143,6 +146,10 @@ VEP-Annotation:
                 file: MutationAssessor/hg19_precomputed_MutationAssessor.vcf.gz
                 method: exact
                 prefix: custom
+            gnomAD:
+                file: gnomAD/gnomAD_genome_r2.1.1.vcf.gz
+                method: exact
+                prefix: custom
 
         # Bigwig files the key of the dictionary is used as the name to present the feature in the annotated file
         bigwig-files: