diff --git a/aidiva/run_annotation_and_AIdiva.py b/aidiva/run_annotation_and_AIdiva.py
index d82f10f..37aaead 100644
--- a/aidiva/run_annotation_and_AIdiva.py
+++ b/aidiva/run_annotation_and_AIdiva.py
@@ -27,7 +27,7 @@
     parser.add_argument("--hpo_list", type=str, dest="hpo_list", metavar="hpo.txt", required=False, help="TXT file containing the HPO terms reported for the current patient")
     parser.add_argument("--gene_exclusion", type=str, dest="gene_exclusion", metavar="gene_exclusion.txt", required=False, help="TXT file containing the genes to exclude in the analysis")
     parser.add_argument("--family_file", type=str, dest="family_file", metavar="family.txt", required=False, help="TXT file showing the family relation of the current patient")
-    parser.add_argument("--family_type", type=str, dest="family_type", metavar="SINGLE", required=False, help="String indicating the present family type [SINGLE, TRIO, FAMILY]")
+    parser.add_argument("--family_type", type=str, dest="family_type", metavar="SINGLE", required=False, help="String indicating the present family type [SINGLE, TRIO]")
     #parser.add_argument("--threads", type=int, dest="threads", metavar="1", required=False, help="Number of threads to use. (default: 1)")
 
     args = parser.parse_args()
@@ -43,7 +43,7 @@
     data_path = "/mnt/data/"
     # data_path = os.path.dirname(os.path.abspath(__file__)) + "/../data/"
 
-    ref_path = data_path + configuration["Analysis-Input"]["ref-path"]
+    ref_path = configuration["Analysis-Input"]["ref-path"]
 
     # parse input files
     input_vcf = args.vcf
@@ -91,8 +91,9 @@
     hpo_resources_folder = os.path.dirname(os.path.abspath(__file__)) + "/../data/hpo_resources/"
 
     print("Starting VCF preparation...")
-    # filtering step to remove unsupported variants
-    annotate.annotate_consequence_information(input_vcf, str(working_directory + input_filename + "_consequence.vcf"), vep_annotation_dict, num_cores)
+    # sorting and filtering step to remove unsupported variants
+    annotate.sort_vcf(input_vcf, str(working_directory + input_filename + "_sorted.vcf"), vep_annotation_dict)
+    annotate.annotate_consequence_information(str(working_directory + input_filename + "_sorted.vcf"), str(working_directory + input_filename + "_consequence.vcf"), vep_annotation_dict, num_cores)
     filt_vcf.filter_coding_variants(str(working_directory + input_filename + "_consequence.vcf"), str(working_directory + input_filename + "_filtered.vcf"), "CONS")
 
     # convert input vcf to pandas dataframe
@@ -108,8 +109,8 @@
     # Additional annotation with AnnotateFromVCF (a ngs-bits tool)
     # If VCF is used as output format VEP won't annotate from custom VCF files
     print("Starting AnnotateFromVCF annotation...")
-    annotate.annotate_from_vcf(str(working_directory + input_filename + "_snp_vep.vcf"), str(working_directory + input_filename + "_snp_vep_annotated.vcf"), num_cores)
-    annotate.annotate_from_vcf(str(working_directory + input_filename + "_indel_expanded_vep.vcf"), str(working_directory + input_filename + "_indel_expanded_vep_annotated.vcf"), num_cores)
+    annotate.annotate_from_vcf(str(working_directory + input_filename + "_snp_vep.vcf"), str(working_directory + input_filename + "_snp_vep_annotated.vcf"), vep_annotation_dict, num_cores)
+    annotate.annotate_from_vcf(str(working_directory + input_filename + "_indel_expanded_vep.vcf"), str(working_directory + input_filename + "_indel_expanded_vep_annotated.vcf"), vep_annotation_dict, num_cores)
 
     # convert annotated vcfs back to pandas dataframes
     input_data_snp_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(working_directory + input_filename + "_snp_vep_annotated.vcf"), False, num_cores)
database_path + vep_annotation_dict["cadd-indel"] + " " - vep_command = vep_command + "--plugin REVEL," + database_path + vep_annotation_dict["revel"] + " " + vep_command = vep_command + "--plugin CADD," + vep_annotation_dict["db"] + vep_annotation_dict["cadd-snps"] + " " #"," + vep_annotation_dict["db"] + vep_annotation_dict["cadd-indel"] + " " + vep_command = vep_command + "--plugin REVEL," + vep_annotation_dict["db"] + vep_annotation_dict["revel"] + " " #vep_command = vep_command + "--plugin dbNSFP," + vep_annotation_dict["dbNSFP"] + ",MutationAssessor_score,Eigen-raw,Eigen-phred" + " " if bed_annotation: for key in bed_annotation: if (key != "simpleRepeat") and (key != "oe_lof"): - vep_command = vep_command + "--custom " + database_path + bed_annotation[key]["file"] + "," + key + ",bed," + bed_annotation[key]["method"] + ",0" + " " + vep_command = vep_command + "--custom " + vep_annotation_dict["db"] + bed_annotation[key]["file"] + "," + key + ",bed," + bed_annotation[key]["method"] + ",0" + " " # does not work (seems to be a problem with VEP) #if vcf_annotation: @@ -66,11 +61,11 @@ def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_di if bigwig_annotation: for key in bigwig_annotation: - vep_command = vep_command + "--custom " + database_path + bigwig_annotation[key]["file"] + "," + key + ",bigwig," + bigwig_annotation[key]["method"] + ",0" + " " + vep_command = vep_command + "--custom " + vep_annotation_dict["db"] + bigwig_annotation[key]["file"] + "," + key + ",bigwig," + bigwig_annotation[key]["method"] + ",0" + " " vep_command = vep_command + "-i " + input_vcf_file + " " vep_command = vep_command + "-o " + output_vcf_file + " " - vep_command = vep_command + "--fork " + str(vep_annotation_dict["num-threads"]) + " " + vep_command = vep_command + "--fork " + str(num_cores) + " " vep_command = vep_command + "--vcf" + " " vep_command = vep_command + "--no_stats" + " " vep_command = vep_command + "--force_overwrite" @@ -81,24 +76,19 @@ def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_di def annotate_consequence_information(input_vcf_file, output_vcf_file, vep_annotation_dict, num_cores=1): # the path to the executable - database_path = "/mnt/data/dbs/" - # database_path = os.path.dirname(__file__) + "/../../annotation_resources/ - - tool_path = "/mnt/data/tools/" - # tool_path = os.path.dirname(__file__) + "/../../tools/" - vep_command = tool_path + vep_annotation_dict["vep"] + " " + vep_command = vep_annotation_dict["vep"] + "/" + "vep " # set the correct paths to the needed perl modules if "PERL5LIB" in os.environ: - os.environ["PERL5LIB"] = tool_path + "/ensembl-vep-release-100.3/Bio/:" + tool_path + "/ensembl-vep-release-100.3/cpan/lib/perl5/:" + os.environ["PERL5LIB"] + os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/:" + os.environ["PERL5LIB"] else: - os.environ["PERL5LIB"] = tool_path + "/ensembl-vep-release-100.3/Bio/:" + tool_path + "/ensembl-vep-release-100.3/cpan/lib/perl5/" + os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/" # add essential parameters vep_command = vep_command + "--species homo_sapiens --assembly GRCh37" + " " vep_command = vep_command + "--offline" + " " vep_command = vep_command + "--cache" + " " - vep_command = vep_command + "--dir_cache " + database_path + vep_annotation_dict["cache-path"] + " " + vep_command = vep_command + "--dir_cache " + 
vep_annotation_dict["db"] + vep_annotation_dict["cache-path"] + " " vep_command = vep_command + "--gencode_basic" + " " @@ -106,7 +96,7 @@ def annotate_consequence_information(input_vcf_file, output_vcf_file, vep_annota vep_command = vep_command + "-o " + output_vcf_file + " " vep_command = vep_command + "--fields Consequence " vep_command = vep_command + "--vcf_info_field CONS " - vep_command = vep_command + "--fork " + str(vep_annotation_dict["num-threads"]) + " " + vep_command = vep_command + "--fork " + str(num_cores) + " " vep_command = vep_command + "--vcf" + " " vep_command = vep_command + "--no_stats" + " " vep_command = vep_command + "--force_overwrite" @@ -115,23 +105,18 @@ def annotate_consequence_information(input_vcf_file, output_vcf_file, vep_annota print("The annotated VCF is saved as %s" % (output_vcf_file)) -def annotate_from_vcf(input_vcf_file, output_vcf_file, num_cores): +def annotate_from_vcf(input_vcf_file, output_vcf_file, vep_annotation_dict, num_cores): tmp = tempfile.NamedTemporaryFile(mode="w+b", suffix=".config", delete=False) - database_path = "/mnt/data/dbs/" - # database_path = os.path.dirname(__file__) + "/../../annotation_resources/ - #tool_path = "/mnt/storage1/share/opt/" - tool_path = "/mnt/data/tools/" - # tool_path = os.path.dirname(__file__) + "/../../tools/" - - command = tool_path + "ngs-bits/bin/VcfAnnotateFromVcf -config_file " + tmp.name + " -in " + input_vcf_file + " -out " + output_vcf_file + " -threads " + str(num_cores) + vcf_annotation = vep_annotation_dict["custom"]["vcf-files"] + command = vep_annotation_dict["ngs-bits"] + "/" + "VcfAnnotateFromVcf -config_file " + tmp.name + " -in " + input_vcf_file + " -out " + output_vcf_file + " -threads " + str(num_cores) try: - tmp.write(str(database_path + "Condel/hg19_precomputed_Condel.vcf.gz\t\tCONDEL\t\ttrue\n").encode()) - tmp.write(str(database_path + "Eigen/hg19_Eigen-phred_coding_chrom1-22.vcf.gz\t\tEIGEN_PHRED\t\ttrue\n").encode()) - tmp.write(str(database_path + "fathmm-XF/hg19_fathmm_xf_coding.vcf.gz\t\tFATHMM_XF\t\ttrue\n").encode()) - tmp.write(str(database_path + "MutationAssessor/hg19_precomputed_MutationAssessor.vcf.gz\t\tMutationAssessor\t\ttrue\n").encode()) - tmp.write(str(database_path + "gnomAD/gnomAD_genome_r2.1.1.vcf.gz\tgnomAD\tAN,Hom\t\ttrue\n").encode()) + tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["CONDEL"]["file"] + "\t\tCONDEL\t\ttrue\n").encode()) + tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["EIGEN_PHRED"]["file"] + "\t\tEIGEN_PHRED\t\ttrue\n").encode()) + tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["FATHMM_XF"]["file"] + "\t\tFATHMM_XF\t\ttrue\n").encode()) + tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["MutationAssessor"]["file"] + "\t\tMutationAssessor\t\ttrue\n").encode()) + tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["gnomAD"]["file"] + "\tgnomAD\tAN,Hom\t\ttrue\n").encode()) tmp.close() subprocess.run(command, shell=True, check=True) @@ -139,6 +124,11 @@ def annotate_from_vcf(input_vcf_file, output_vcf_file, num_cores): os.remove(tmp.name) +def sort_vcf(input_vcf_file, output_vcf_file, vep_annotation_dict): + command = vep_annotation_dict["ngs-bits"] + "/" + "VcfSort -in " + input_vcf_file + " -out " + output_vcf_file + subprocess.run(command, shell=True, check=True) + + if __name__=="__main__": parser = argparse.ArgumentParser(description = "AIdiva -- Annotation with VEP") parser.add_argument("--in_data", type=str, dest="in_data", metavar="data.vcf", required=True, help="VCF file containing the 
diff --git a/aidiva/variant_prioritization/prioritize_variants.py b/aidiva/variant_prioritization/prioritize_variants.py
index 5b61a6a..8c8142a 100644
--- a/aidiva/variant_prioritization/prioritize_variants.py
+++ b/aidiva/variant_prioritization/prioritize_variants.py
@@ -170,9 +170,9 @@ def parallelize_dataframe_processing(variant_data, function, n_cores):
 
 
 def parallelized_variant_processing(variant_data):
+    variant_data = check_inheritance(variant_data, family_type, family)
     variant_data[["HPO_RELATEDNESS", "HPO_RELATEDNESS_INTERACTING", "FINAL_AIDIVA_SCORE"]] = variant_data.apply(lambda variant: pd.Series(compute_hpo_relatedness_and_final_score(variant)), axis=1)
     variant_data[["FILTER_PASSED", "FILTER_COMMENT"]] = variant_data.apply(lambda variant: pd.Series(check_filters(variant)), axis=1)
-    variant_data = check_inheritance(variant_data, family_type, family)
 
     return variant_data
 
@@ -224,11 +224,13 @@ def compute_hpo_relatedness_and_final_score(variant):
                 gene_distances_interacting.append(g_dist)
 
         if gene_distances or gene_distances_interacting:
-            hpo_relatedness = max(gene_distances, default=0.0)
-            hpo_relatedness_interacting = max(gene_distances_interacting, default=0.0)
+            # take only the maximum HPO relatedness to prevent downvoting of genes if no HPO relation in interacting genes is observed
+            hpo_relatedness = max(max(gene_distances, default=0.0), max(gene_distances_interacting, default=0.0))
+            #hpo_relatedness_interacting = max(gene_distances_interacting, default=0.0)
             ## TODO: try different weighting of AIDIVA_SCORE and HPO_RELATEDNESS and HPO_RELATEDNESS_INTERACTING (eg 0.6 and 0.3 and 0.1)
-            #final_score = (float(variant["AIDIVA_SCORE"]) * 0.6 + float(hpo_relatedness) * 0.3 + float(hpo_relatedness_interacting) * 0.1)# / 3
-            final_score = (float(variant["AIDIVA_SCORE"]) + float(hpo_relatedness) + float(hpo_relatedness_interacting)) / 3
+            # predicted pathogenicity has a higher weight than the HPO relatedness
+            final_score = (float(variant["AIDIVA_SCORE"]) * 0.67 + float(hpo_relatedness) * 0.33) #+ float(hpo_relatedness_interacting) * 0.1)# / 3
+            #final_score = (float(variant["AIDIVA_SCORE"]) + float(hpo_relatedness) + float(hpo_relatedness_interacting)) / 3
         else:
             final_score = np.nan
             hpo_relatedness = np.nan
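The scoring change replaces the three-term average with a weighted sum, after collapsing the gene's own HPO relatedness and the best relatedness among its interacting genes into a single maximum. A small worked example with made-up numbers:

# illustrative values, not taken from a real sample
aidiva_score = 0.9                   # predicted pathogenicity (AIDIVA_SCORE)
gene_distances = [0.2, 0.4]          # HPO relatedness of the variant's own gene(s)
gene_distances_interacting = [0.7]   # HPO relatedness of the interacting genes

# collapse both lists into one maximum so a weak interacting-gene signal cannot downvote the gene itself
hpo_relatedness = max(max(gene_distances, default=0.0), max(gene_distances_interacting, default=0.0))

# predicted pathogenicity is weighted roughly twice as strongly as the HPO relatedness
final_score = aidiva_score * 0.67 + hpo_relatedness * 0.33
print(round(final_score, 3))  # 0.834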
diff --git a/data/AIdiva_configuration_with_annotation.yaml b/data/AIdiva_configuration_with_annotation.yaml
index 6895e24..f98dbbd 100644
--- a/data/AIdiva_configuration_with_annotation.yaml
+++ b/data/AIdiva_configuration_with_annotation.yaml
@@ -13,7 +13,7 @@ Analysis-Input:
     #work-dir: test_workdir2/
 
     # path to the reference assembly used during the expansion of the indels
-    ref-path: genomes/GRCh37.fa
+    ref-path: /mnt/share/data/genomes/GRCh37.fa
 
     # trained scoring models used to predict the pathogenicity score
     # if no trained model is present you can use the train_model.py script to train a new custom model
@@ -87,13 +87,16 @@ VEP-Annotation:
     perform-vep-annotation: True
 
     # VEP install path
-    vep: ensembl-vep-release-100.3/vep
+    vep: /mnt/share/opt/ensembl-vep-release-100.3/
 
-    # path to the annotation databases
-    db: dbs/
+    # ngs-bits install path
+    ngs-bits: /mnt/share/opt/ngs-bits-current/
+
+    # path to the annotation databases (the database files themselves are given as paths relative to this directory)
+    db: /mnt/share/data/dbs/
 
     # threads to use during the annotation
-    num-threads: 8
+    num-threads: 10
 
     # Cache directory and plugin directory
     cache-path: ensembl-vep-100/cache
@@ -143,6 +146,10 @@ VEP-Annotation:
                 file: MutationAssessor/hg19_precomputed_MutationAssessor.vcf.gz
                 method: exact
                 prefix: custom
+            gnomAD:
+                file: gnomAD/gnomAD_genome_r2.1.1.vcf.gz
+                method: exact
+                prefix: custom
 
         # Bigwig files the key of the dictionary is used as the name to present the feature in the annotated file
         bigwig-files:
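For reference, the renamed keys of the VEP-Annotation section are consumed as path prefixes by the annotation code above, while the per-database file entries stay relative to db. A sketch of how the configured values combine (yaml.safe_load and the relative file location are assumptions about how the file is read):

import yaml

with open("data/AIdiva_configuration_with_annotation.yaml") as config_file:
    configuration = yaml.safe_load(config_file)

vep_annotation_dict = configuration["VEP-Annotation"]

# executables are resolved against the install paths
vep_executable = vep_annotation_dict["vep"] + "/" + "vep"
vcf_sort = vep_annotation_dict["ngs-bits"] + "/" + "VcfSort"

# the VEP cache and the annotation databases are resolved against the "db" prefix
cache_dir = vep_annotation_dict["db"] + vep_annotation_dict["cache-path"]
gnomad_vcf = vep_annotation_dict["db"] + vep_annotation_dict["custom"]["vcf-files"]["gnomAD"]["file"]

print(vep_executable)  # /mnt/share/opt/ensembl-vep-release-100.3//vep
print(vcf_sort)        # /mnt/share/opt/ngs-bits-current//VcfSort
print(cache_dir)       # /mnt/share/data/dbs/ensembl-vep-100/cache
print(gnomad_vcf)      # /mnt/share/data/dbs/gnomAD/gnomAD_genome_r2.1.1.vcf.gz

The doubled slash comes from the trailing slash in the configured install paths combined with the "/" the code inserts; it is harmless on POSIX systems, but dropping one of the two would make the assembled commands cleaner.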