Skip to content

Commit

Permalink
Merge pull request #17 from imgag/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
dboceck authored Feb 26, 2021
2 parents 58bf624 + e85655b commit e49bc46
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 55 deletions.
13 changes: 7 additions & 6 deletions aidiva/run_annotation_and_AIdiva.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
parser.add_argument("--hpo_list", type=str, dest="hpo_list", metavar="hpo.txt", required=False, help="TXT file containing the HPO terms reported for the current patient")
parser.add_argument("--gene_exclusion", type=str, dest="gene_exclusion", metavar="gene_exclusion.txt", required=False, help="TXT file containing the genes to exclude in the analysis")
parser.add_argument("--family_file", type=str, dest="family_file", metavar="family.txt", required=False, help="TXT file showing the family relation of the current patient")
parser.add_argument("--family_type", type=str, dest="family_type", metavar="SINGLE", required=False, help="String indicating the present family type [SINGLE, TRIO, FAMILY]")
parser.add_argument("--family_type", type=str, dest="family_type", metavar="SINGLE", required=False, help="String indicating the present family type [SINGLE, TRIO]")
#parser.add_argument("--threads", type=int, dest="threads", metavar="1", required=False, help="Number of threads to use. (default: 1)")
args = parser.parse_args()

Expand All @@ -43,7 +43,7 @@

data_path = "/mnt/data/"
# data_path = os.path.dirname(os.path.abspath(__file__)) + "/../data/"
ref_path = data_path + configuration["Analysis-Input"]["ref-path"]
ref_path = configuration["Analysis-Input"]["ref-path"]

# parse input files
input_vcf = args.vcf
Expand Down Expand Up @@ -91,8 +91,9 @@
hpo_resources_folder = os.path.dirname(os.path.abspath(__file__)) + "/../data/hpo_resources/"

print("Starting VCF preparation...")
# filtering step to remove unsupported variants
annotate.annotate_consequence_information(input_vcf, str(working_directory + input_filename + "_consequence.vcf"), vep_annotation_dict, num_cores)
# sorting and filtering step to remove unsupported variants
annotate.sort_vcf(input_vcf, str(working_directory + input_filename + "_sorted.vcf"), vep_annotation_dict)
annotate.annotate_consequence_information(str(working_directory + input_filename + "_sorted.vcf"), str(working_directory + input_filename + "_consequence.vcf"), vep_annotation_dict, num_cores)
filt_vcf.filter_coding_variants(str(working_directory + input_filename + "_consequence.vcf"), str(working_directory + input_filename + "_filtered.vcf"), "CONS")

# convert input vcf to pandas dataframe
Expand All @@ -108,8 +109,8 @@
# Additional annotation with AnnotateFromVCF (a ngs-bits tool)
# If VCF is used as output format VEP won't annotate from custom VCF files
print("Starting AnnotateFromVCF annotation...")
annotate.annotate_from_vcf(str(working_directory + input_filename + "_snp_vep.vcf"), str(working_directory + input_filename + "_snp_vep_annotated.vcf"), num_cores)
annotate.annotate_from_vcf(str(working_directory + input_filename + "_indel_expanded_vep.vcf"), str(working_directory + input_filename + "_indel_expanded_vep_annotated.vcf"), num_cores)
annotate.annotate_from_vcf(str(working_directory + input_filename + "_snp_vep.vcf"), str(working_directory + input_filename + "_snp_vep_annotated.vcf"), vep_annotation_dict, num_cores)
annotate.annotate_from_vcf(str(working_directory + input_filename + "_indel_expanded_vep.vcf"), str(working_directory + input_filename + "_indel_expanded_vep_annotated.vcf"), vep_annotation_dict, num_cores)

# convert annotated vcfs back to pandas dataframes
input_data_snp_annotated = convert_vcf.convert_vcf_to_pandas_dataframe(str(working_directory + input_filename + "_snp_vep_annotated.vcf"), False, num_cores)
Expand Down
68 changes: 29 additions & 39 deletions aidiva/variant_annotation/annotate_with_vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,23 @@

def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_dict, basic=False, expanded=False, num_cores=1):
# the path to the executable
database_path = "/mnt/data/dbs/"
# database_path = os.path.dirname(__file__) + "/../../annotation_resources/

tool_path = "/mnt/data/tools/"
# tool_path = os.path.dirname(__file__) + "/../../tools/"
vep_command = tool_path + vep_annotation_dict["vep"] + " "
vep_command = vep_annotation_dict["vep"] + "/" + "vep "

bed_annotation = vep_annotation_dict["custom"]["bed-files"]
bigwig_annotation = vep_annotation_dict["custom"]["bigwig-files"]
#vcf_annotation = vep_annotation_dict["custom"]["vcf-files"]

# set the correct paths to the needed perl modules
if "PERL5LIB" in os.environ:
os.environ["PERL5LIB"] = tool_path + "/ensembl-vep-release-100.3/Bio/:" + tool_path + "/ensembl-vep-release-100.3/cpan/lib/perl5/:" + os.environ["PERL5LIB"]
os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/:" + os.environ["PERL5LIB"]
else:
os.environ["PERL5LIB"] = tool_path + "/ensembl-vep-release-100.3/Bio/:" + tool_path + "/ensembl-vep-release-100.3/cpan/lib/perl5/"
os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/"

# add essential parameters
vep_command = vep_command + "--species homo_sapiens --assembly GRCh37" + " "
vep_command = vep_command + "--offline" + " "
vep_command = vep_command + "--cache" + " "
vep_command = vep_command + "--dir_cache " + database_path + vep_annotation_dict["cache-path"] + " "
vep_command = vep_command + "--dir_cache " + vep_annotation_dict["db"] + vep_annotation_dict["cache-path"] + " "
vep_command = vep_command + "--gencode_basic" + " "
vep_command = vep_command + "--symbol" + " "
vep_command = vep_command + "--biotype" + " "
Expand All @@ -42,22 +37,22 @@ def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_di
#vep_command = vep_command + "--af_1kg" + " "
#vep_command = vep_command + "--af_esp" + " "
#vep_command = vep_command + "--af_gnomad" + " "
vep_command = vep_command + "--custom " + database_path + bed_annotation["simpleRepeat"]["file"] + "," + "simpleRepeat" + ",bed," + bed_annotation["simpleRepeat"]["method"] + ",0" + " "
vep_command = vep_command + "--custom " + database_path + bed_annotation["oe_lof"]["file"] + "," + "oe_lof" + ",bed," + bed_annotation["oe_lof"]["method"] + ",0" + " "
vep_command = vep_command + "--custom " + vep_annotation_dict["db"] + bed_annotation["simpleRepeat"]["file"] + "," + "simpleRepeat" + ",bed," + bed_annotation["simpleRepeat"]["method"] + ",0" + " "
vep_command = vep_command + "--custom " + vep_annotation_dict["db"] + bed_annotation["oe_lof"]["file"] + "," + "oe_lof" + ",bed," + bed_annotation["oe_lof"]["method"] + ",0" + " "

# vep plugins to use
if not basic:
vep_command = vep_command + "--sift s" + " "
vep_command = vep_command + "--polyphen s" + " "
#vep_command = vep_command + "--plugin Condel," + vep_annotation_dict["condel"] + ",s" + " "
vep_command = vep_command + "--plugin CADD," + database_path + vep_annotation_dict["cadd-snps"] + "," + database_path + vep_annotation_dict["cadd-indel"] + " "
vep_command = vep_command + "--plugin REVEL," + database_path + vep_annotation_dict["revel"] + " "
vep_command = vep_command + "--plugin CADD," + vep_annotation_dict["db"] + vep_annotation_dict["cadd-snps"] + " " #"," + vep_annotation_dict["db"] + vep_annotation_dict["cadd-indel"] + " "
vep_command = vep_command + "--plugin REVEL," + vep_annotation_dict["db"] + vep_annotation_dict["revel"] + " "
#vep_command = vep_command + "--plugin dbNSFP," + vep_annotation_dict["dbNSFP"] + ",MutationAssessor_score,Eigen-raw,Eigen-phred" + " "

if bed_annotation:
for key in bed_annotation:
if (key != "simpleRepeat") and (key != "oe_lof"):
vep_command = vep_command + "--custom " + database_path + bed_annotation[key]["file"] + "," + key + ",bed," + bed_annotation[key]["method"] + ",0" + " "
vep_command = vep_command + "--custom " + vep_annotation_dict["db"] + bed_annotation[key]["file"] + "," + key + ",bed," + bed_annotation[key]["method"] + ",0" + " "

# does not work (seems to be a problem with VEP)
#if vcf_annotation:
Expand All @@ -66,11 +61,11 @@ def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_di

if bigwig_annotation:
for key in bigwig_annotation:
vep_command = vep_command + "--custom " + database_path + bigwig_annotation[key]["file"] + "," + key + ",bigwig," + bigwig_annotation[key]["method"] + ",0" + " "
vep_command = vep_command + "--custom " + vep_annotation_dict["db"] + bigwig_annotation[key]["file"] + "," + key + ",bigwig," + bigwig_annotation[key]["method"] + ",0" + " "

vep_command = vep_command + "-i " + input_vcf_file + " "
vep_command = vep_command + "-o " + output_vcf_file + " "
vep_command = vep_command + "--fork " + str(vep_annotation_dict["num-threads"]) + " "
vep_command = vep_command + "--fork " + str(num_cores) + " "
vep_command = vep_command + "--vcf" + " "
vep_command = vep_command + "--no_stats" + " "
vep_command = vep_command + "--force_overwrite"
Expand All @@ -81,32 +76,27 @@ def call_vep_and_annotate_vcf(input_vcf_file, output_vcf_file, vep_annotation_di

def annotate_consequence_information(input_vcf_file, output_vcf_file, vep_annotation_dict, num_cores=1):
# the path to the executable
database_path = "/mnt/data/dbs/"
# database_path = os.path.dirname(__file__) + "/../../annotation_resources/

tool_path = "/mnt/data/tools/"
# tool_path = os.path.dirname(__file__) + "/../../tools/"
vep_command = tool_path + vep_annotation_dict["vep"] + " "
vep_command = vep_annotation_dict["vep"] + "/" + "vep "

# set the correct paths to the needed perl modules
if "PERL5LIB" in os.environ:
os.environ["PERL5LIB"] = tool_path + "/ensembl-vep-release-100.3/Bio/:" + tool_path + "/ensembl-vep-release-100.3/cpan/lib/perl5/:" + os.environ["PERL5LIB"]
os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/:" + os.environ["PERL5LIB"]
else:
os.environ["PERL5LIB"] = tool_path + "/ensembl-vep-release-100.3/Bio/:" + tool_path + "/ensembl-vep-release-100.3/cpan/lib/perl5/"
os.environ["PERL5LIB"] = vep_annotation_dict["vep"] + "/" + "Bio/:" + vep_annotation_dict["vep"] + "/" + "cpan/lib/perl5/"

# add essential parameters
vep_command = vep_command + "--species homo_sapiens --assembly GRCh37" + " "
vep_command = vep_command + "--offline" + " "
vep_command = vep_command + "--cache" + " "
vep_command = vep_command + "--dir_cache " + database_path + vep_annotation_dict["cache-path"] + " "
vep_command = vep_command + "--dir_cache " + vep_annotation_dict["db"] + vep_annotation_dict["cache-path"] + " "
vep_command = vep_command + "--gencode_basic" + " "


vep_command = vep_command + "-i " + input_vcf_file + " "
vep_command = vep_command + "-o " + output_vcf_file + " "
vep_command = vep_command + "--fields Consequence "
vep_command = vep_command + "--vcf_info_field CONS "
vep_command = vep_command + "--fork " + str(vep_annotation_dict["num-threads"]) + " "
vep_command = vep_command + "--fork " + str(num_cores) + " "
vep_command = vep_command + "--vcf" + " "
vep_command = vep_command + "--no_stats" + " "
vep_command = vep_command + "--force_overwrite"
Expand All @@ -115,30 +105,30 @@ def annotate_consequence_information(input_vcf_file, output_vcf_file, vep_annota
print("The annotated VCF is saved as %s" % (output_vcf_file))


def annotate_from_vcf(input_vcf_file, output_vcf_file, num_cores):
def annotate_from_vcf(input_vcf_file, output_vcf_file, vep_annotation_dict, num_cores):
tmp = tempfile.NamedTemporaryFile(mode="w+b", suffix=".config", delete=False)

database_path = "/mnt/data/dbs/"
# database_path = os.path.dirname(__file__) + "/../../annotation_resources/
#tool_path = "/mnt/storage1/share/opt/"
tool_path = "/mnt/data/tools/"
# tool_path = os.path.dirname(__file__) + "/../../tools/"

command = tool_path + "ngs-bits/bin/VcfAnnotateFromVcf -config_file " + tmp.name + " -in " + input_vcf_file + " -out " + output_vcf_file + " -threads " + str(num_cores)
vcf_annotation = vep_annotation_dict["custom"]["vcf-files"]
command = vep_annotation_dict["ngs-bits"] + "/" + "VcfAnnotateFromVcf -config_file " + tmp.name + " -in " + input_vcf_file + " -out " + output_vcf_file + " -threads " + str(num_cores)

try:
tmp.write(str(database_path + "Condel/hg19_precomputed_Condel.vcf.gz\t\tCONDEL\t\ttrue\n").encode())
tmp.write(str(database_path + "Eigen/hg19_Eigen-phred_coding_chrom1-22.vcf.gz\t\tEIGEN_PHRED\t\ttrue\n").encode())
tmp.write(str(database_path + "fathmm-XF/hg19_fathmm_xf_coding.vcf.gz\t\tFATHMM_XF\t\ttrue\n").encode())
tmp.write(str(database_path + "MutationAssessor/hg19_precomputed_MutationAssessor.vcf.gz\t\tMutationAssessor\t\ttrue\n").encode())
tmp.write(str(database_path + "gnomAD/gnomAD_genome_r2.1.1.vcf.gz\tgnomAD\tAN,Hom\t\ttrue\n").encode())
tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["CONDEL"]["file"] + "\t\tCONDEL\t\ttrue\n").encode())
tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["EIGEN_PHRED"]["file"] + "\t\tEIGEN_PHRED\t\ttrue\n").encode())
tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["FATHMM_XF"]["file"] + "\t\tFATHMM_XF\t\ttrue\n").encode())
tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["MutationAssessor"]["file"] + "\t\tMutationAssessor\t\ttrue\n").encode())
tmp.write(str(vep_annotation_dict["db"] + vcf_annotation["gnomAD"]["file"] + "\tgnomAD\tAN,Hom\t\ttrue\n").encode())
tmp.close()

subprocess.run(command, shell=True, check=True)
finally:
os.remove(tmp.name)


def sort_vcf(input_vcf_file, output_vcf_file, vep_annotation_dict):
command = vep_annotation_dict["ngs-bits"] + "/" + "VcfSort -in " + input_vcf_file + " -out " + output_vcf_file
subprocess.run(command, shell=True, check=True)


if __name__=="__main__":
parser = argparse.ArgumentParser(description = "AIdiva -- Annotation with VEP")
parser.add_argument("--in_data", type=str, dest="in_data", metavar="data.vcf", required=True, help="VCF file containing the data, you want to annotate with VEP\n")
Expand Down
12 changes: 7 additions & 5 deletions aidiva/variant_prioritization/prioritize_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,9 +170,9 @@ def parallelize_dataframe_processing(variant_data, function, n_cores):


def parallelized_variant_processing(variant_data):
variant_data = check_inheritance(variant_data, family_type, family)
variant_data[["HPO_RELATEDNESS", "HPO_RELATEDNESS_INTERACTING", "FINAL_AIDIVA_SCORE"]] = variant_data.apply(lambda variant: pd.Series(compute_hpo_relatedness_and_final_score(variant)), axis=1)
variant_data[["FILTER_PASSED", "FILTER_COMMENT"]] = variant_data.apply(lambda variant: pd.Series(check_filters(variant)), axis=1)
variant_data = check_inheritance(variant_data, family_type, family)

return variant_data

Expand Down Expand Up @@ -224,11 +224,13 @@ def compute_hpo_relatedness_and_final_score(variant):
gene_distances_interacting.append(g_dist)

if gene_distances or gene_distances_interacting:
hpo_relatedness = max(gene_distances, default=0.0)
hpo_relatedness_interacting = max(gene_distances_interacting, default=0.0)
# take only the maximum HPO relatedness to prevent downvoting of genes if no HPO relation in interacting genes is observed
hpo_relatedness = max(max(gene_distances, default=0.0), max(gene_distances_interacting, default=0.0))
#hpo_relatedness_interacting = max(gene_distances_interacting, default=0.0)
## TODO: try different weighting of AIDIVA_SCORE and HPO_RELATEDNESS and HPO_RELATEDNESS_INTERACTING (eg 0.6 and 0.3 and 0.1)
#final_score = (float(variant["AIDIVA_SCORE"]) * 0.6 + float(hpo_relatedness) * 0.3 + float(hpo_relatedness_interacting) * 0.1)# / 3
final_score = (float(variant["AIDIVA_SCORE"]) + float(hpo_relatedness) + float(hpo_relatedness_interacting)) / 3
# predicted pathogenicity has a higher weight than the HPO relatedness
final_score = (float(variant["AIDIVA_SCORE"]) * 0.67 + float(hpo_relatedness) * 0.33) #+ float(hpo_relatedness_interacting) * 0.1)# / 3
#final_score = (float(variant["AIDIVA_SCORE"]) + float(hpo_relatedness) + float(hpo_relatedness_interacting)) / 3
else:
final_score = np.nan
hpo_relatedness = np.nan
Expand Down
17 changes: 12 additions & 5 deletions data/AIdiva_configuration_with_annotation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Analysis-Input:
#work-dir: test_workdir2/

# path to the reference assembly used during the expansion of the indels
ref-path: genomes/GRCh37.fa
ref-path: /mnt/share/data/genomes/GRCh37.fa

# trained scoring models used to predict the pathogenicity score
# if no trained model is present you can use the train_model.py script to train a new custom model
Expand Down Expand Up @@ -87,13 +87,16 @@ VEP-Annotation:
perform-vep-annotation: True

# VEP install path
vep: ensembl-vep-release-100.3/vep
vep: /mnt/share/opt/ensembl-vep-release-100.3/

# path to the annotation databases
db: dbs/
# ngs-bits install path
ngs-bits: /mnt/share/opt/ngs-bits-current/

# path to the annotation databases (the databases itself are given as relative path in this directory)
db: /mnt/share/data/dbs/

# threads to use during the annotation
num-threads: 8
num-threads: 10

# Cache directory and plugin directory
cache-path: ensembl-vep-100/cache
Expand Down Expand Up @@ -143,6 +146,10 @@ VEP-Annotation:
file: MutationAssessor/hg19_precomputed_MutationAssessor.vcf.gz
method: exact
prefix: custom
gnomAD:
file: gnomAD/gnomAD_genome_r2.1.1.vcf.gz
method: exact
prefix: custom

# Bigwig files the key of the dictionary is used as the name to present the feature in the annotated file
bigwig-files:
Expand Down

0 comments on commit e49bc46

Please sign in to comment.