From 3fb800db314314ba902b972c1b53fb36d483ad08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CMarcel-Mueck=E2=80=9D?= <“mueckm1@gmail.com”> Date: Mon, 23 Dec 2024 16:46:51 +0100 Subject: [PATCH] externalized 1 hot encoding of sequences, made encoding faster,modified ancoding to include indels --- deeprvat/annotations/annotations.py | 172 +++++++++++++++++++--- pipelines/annotations/deepripe3.snakefile | 66 +++++++++ 2 files changed, 218 insertions(+), 20 deletions(-) mode change 100644 => 100755 deeprvat/annotations/annotations.py create mode 100644 pipelines/annotations/deepripe3.snakefile diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py old mode 100644 new mode 100755 index 622af12e..221ba032 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -21,7 +21,8 @@ from tqdm import tqdm, trange from fastparquet import ParquetFile import yaml - +import lzma +import pysam def precision(y_true, y_pred): """ @@ -817,17 +818,17 @@ def deepsea_pca( @click.argument("output-dir", type=click.Path(exists=True)) @click.argument("genomefasta", type=click.Path(exists=True)) @click.argument("pybedtools_tmp_dir", type=click.Path(exists=True)) -@click.argument("saved_deepripe_models_path", type=click.Path(exists=True)) +@click.argument("seq_output_file", type=click.Path()) @click.argument("n_jobs", type=int) -@click.argument("saved-model-type", type=str) -def scorevariants_deepripe( +@click.argument("seq-len", type=int) +def encodeseqs_deepripe( variants_file: str, output_dir: str, genomefasta: str, pybedtools_tmp_dir: str, - saved_deepripe_models_path: str, + seq_output_file: str, n_jobs: int, - saved_model_type: str = "parclip", + seq_len: int, ): """ Score variants using deep learning models trained on PAR-CLIP and eCLIP data. @@ -865,6 +866,38 @@ def scorevariants_deepripe( os.mkdir(tmp_path) pybedtools.set_tempdir(tmp_path) + + + if not os.path.exists(bed_file): + convert2bed(variants_file, output_dir) + + variant_bed = pybedtools.BedTool(bed_file) + print(f"Encoding seqs for: {bed_file}") + encode_sequences(variant_bed, genomefasta, seq_output_file, seq_len, n_jobs) + + + + + + + + +@cli.command() +@click.argument("variants-file", type=click.Path(exists=True)) +@click.argument("output-dir", type=click.Path(exists=True)) +@click.argument("saved_deepripe_models_path", type=click.Path(exists=True)) +@click.argument("encoded-seq-path", type=click.Path(exists=True)) +@click.argument("batch-size", type=int) +@click.argument("saved-model-type", type=str) +def scorevariants_deepripe( + variants_file: str, + output_dir: str, + saved_deepripe_models_path: str, + encoded_seq_path: str, + batch_size: int = 512, + saved_model_type: str = "parclip", +): + file_name = variants_file.split("/")[-1] ### reading variants to score df_variants = pd.read_csv( variants_file, @@ -872,13 +905,8 @@ def scorevariants_deepripe( header=None, names=["chr", "pos", "Uploaded_variant", "ref", "alt"], ) - - if not os.path.exists(bed_file): - convert2bed(variants_file, output_dir) - - variant_bed = pybedtools.BedTool(bed_file) - print(f"Scoring variants for: {bed_file}") - + batch_size = 512 + logger.info(f"batch size is {batch_size}") ### paths for experiments saved_models_dict = { "parclip": "parclip_model", @@ -939,12 +967,12 @@ def scorevariants_deepripe( current_model_type = list_of_model_groups[saved_model_type] predictions = deepripe_score_variant_onlyseq_all( current_model_type, - variant_bed, - genomefasta, + encoded_seq_path, seq_len=model_seq_len[saved_model_type], - n_jobs=n_jobs, + batch_size=batch_size ) + for choice in current_model_type.keys(): print(choice) _, RBPnames = current_model_type[choice] @@ -1109,8 +1137,8 @@ def aggregate_abscores( logger = logging.getLogger(__name__) -def deepripe_score_variant_onlyseq_all( - model_group, variant_bed, genomefasta, seq_len=200, batch_size=1024, n_jobs=32 +def encode_sequences( + variant_bed, genomefasta, encoded_seq_output_path, seq_len=200, n_jobs=32, ): """ Compute variant scores using a deep learning model for each specified variant. @@ -1127,7 +1155,6 @@ def deepripe_score_variant_onlyseq_all( dict: A dictionary containing variant scores for each choice in the model_group. Each entry has the choice name as the key and the corresponding scores as the value. """ - predictions = {} # Parallelize the encoding of variant bedlines encoded_seqs_list = Parallel(n_jobs=n_jobs, verbose=10)( @@ -1144,6 +1171,16 @@ def deepripe_score_variant_onlyseq_all( ] # Concatenate encoded sequences + with lzma.open(encoded_seq_output_path, "wb") as f: + pickle.dump(encoded_seqs_list, f) + + +def deepripe_score_variant_onlyseq_all(model_group, encoded_seq_path, seq_len=200, batch_size=512): + predictions = {} + + with lzma.open(encoded_seq_path, "rb") as f: + encoded_seqs_list = pickle.load(f) + encoded_seqs = tf.concat(encoded_seqs_list, 0) logger.info("Computing predictions") @@ -1154,7 +1191,7 @@ def deepripe_score_variant_onlyseq_all( for i in range(4): cropped_seqs = encoded_seqs[:, i : i + seq_len, :] model, _ = model_group[choice] - pred = model.predict_on_batch(cropped_seqs) + pred = model.predict(cropped_seqs, batch_size = batch_size) wild_indices = tf.range(pred.shape[0], delta=2) mut_indices = tf.range(1, pred.shape[0], delta=2) pred_wild = pred[wild_indices, :] @@ -1614,6 +1651,56 @@ def concatenate_deepsea( raise ValueError +@cli.command() +@click.argument("vep_header_line", type=int) +@click.argument("vep_file", type=click.Path(exists=True)) +@click.argument("variant_file", type=click.Path(exists=True)) +@click.argument("vcf_file", type=click.Path(exists=True)) +@click.argument("out_file", type=click.Path()) +@click.argument("column_yaml", type=click.Path(exists=True)) +def merge_annotations_no_deepripe( + vep_header_line: int, + vep_file: str, + variant_file: str, + vcf_file: str, + out_file: str, + column_yaml: str, +): + """ + Merge VEP, DeepRipe (parclip, hg2, k5), and variant files into one dataFrame and save result as parquet file + + Parameters: + - vep_header_line (int): Line number of the header line in the VEP output file. + - vep_file (str): Path to the VEP file. + - variant_file (str): Path to the variant file. + - vcf_file (str): vcf file containing chrom, pos, ref and alt information + - out_file (str): Path to save the merged output file in Parquet format. + - column yaml file + + Returns: + None + + Example: + $ python annotations.py merge_annotations 1 vep_file.tsv deepripe_parclip.csv deepripe_hg2.csv deepripe_k5.csv variant_file.tsv merged_output.parquet --vepcols_to_retain="AlphaMissense,PolyPhen" + """ + # load vep file + _, _, _, _, _, types_mapping = readYamlColumns(column_yaml) + vep_df = pd.read_csv(vep_file, header=vep_header_line, sep="\t", na_values="-") + + vep_df = process_vep( + vep_file=vep_df, vcf_file=vcf_file, types_mapping=types_mapping + ) + logger.info(f"vep_df shape is {vep_df.shape}") + logger.info(f"reading in {variant_file}") + variants = pd.read_parquet(variant_file) + + logger.info("merge vep to variants M:1") + ca = vep_df.merge( + variants, how="left", on=["chrom", "pos", "ref", "alt"], validate="m:1" + ) + del vep_df + ca.to_parquet(out_file, compression="zstd") + @cli.command() @click.argument("vep_header_line", type=int) @@ -2052,6 +2139,51 @@ def select_rename_fill_annotations( anno_df.fillna(fill_value_mapping, inplace=True) anno_df.to_parquet(out_file) +def one_hot_encode(sequence): + encoding = {'A': [1, 0, 0, 0], 'C': [0, 1, 0, 0], 'G': [0, 0, 1, 0], 'T': [0, 0, 0, 1]} + return [encoding[base] if base in encoding else [0, 0, 0, 0] for base in sequence] + +def fetch_flanking_sequence(fasta, chrom, pos, ref, alt, flank_size=75): + # Fetch reference sequence with flanking region + start = pos - flank_size-2 + end = pos + flank_size + 2 + max(len(ref), len(alt)) # Adjust for reference length + reference_seq = fasta.fetch(chrom, start, end) + + ref_seq = (reference_seq[:flank_size+1]+ref+reference_seq[flank_size+1 + len(ref):])[0:(flank_size*2)+4] + alt_seq = (reference_seq[:flank_size+1] + alt + reference_seq[flank_size+1 + len(ref):])[0:(flank_size*2)+4] + return ref_seq, alt_seq + + +@cli.command() +@click.argument("vcf_file", type=click.Path(exists=True)) +@click.argument("fasta_file", type=click.Path(exists=True)) +@click.argument("out_file", type=click.Path()) +@click.argument("seq_length", type=int) +def flank_encode_vcf(vcf_file, fasta_file, out_file, seq_length): + fasta = pysam.FastaFile(fasta_file) + vcf = pysam.VariantFile(vcf_file) + + sequences = [] + for record in vcf: + chrom = record.chrom + pos = record.pos + ref = record.ref + for alt in record.alts: + # Fetch the 150bp sequences for both reference and alternative alleles + ref_seq, alt_seq = fetch_flanking_sequence(fasta, chrom, pos, ref, alt, flank_size= int(seq_length/2)) + + # One-hot encode the sequences + ref_encoded = one_hot_encode(ref_seq) + alt_encoded = one_hot_encode(alt_seq) + + # Append the pair to the list + sequences.append([ref_encoded, alt_encoded]) + fasta.close() + + with lzma.open(out_file, 'wb') as f: + pickle.dump(sequences, f) if __name__ == "__main__": cli() + + diff --git a/pipelines/annotations/deepripe3.snakefile b/pipelines/annotations/deepripe3.snakefile new file mode 100644 index 00000000..d7394720 --- /dev/null +++ b/pipelines/annotations/deepripe3.snakefile @@ -0,0 +1,66 @@ + +rule encode_150: + input: + variants=rules.extract_with_header.output, + fasta=fasta_dir / fasta_file_name, + output: + anno_tmp_dir / ("{file_stem}_encoded_150.pkl"), + threads: 1 + resources: + mem_mb=lambda wildcards, attempt: 5_000 + 500 * (attempt), + shell: + f"cp {{input.fasta}} $TMPDIR && cp {{input.fasta}}.fai $TMPDIR && python /home/m991k/deeprvat_vep_plugin/deeprvat/deeprvat/annotations/annotations.py flank-encode-vcf {{input.variants}} $TMPDIR/{fasta_file_name} {{output}} 150" + + +rule encode_200: + input: + variants=rules.extract_with_header.output, + fasta=fasta_dir / fasta_file_name, + output: + anno_tmp_dir / ("{file_stem}_encoded_200.pkl"), + threads: 1 + resources: + mem_mb=lambda wildcards, attempt: 5_000 + 500 * (attempt), + shell: + f"cp {{input.fasta}} $TMPDIR && cp {{input.fasta}}.fai $TMPDIR && python /home/m991k/deeprvat_vep_plugin/deeprvat/deeprvat/annotations/annotations.py flank-encode-vcf {{input.variants}} $TMPDIR/{fasta_file_name} {{output}} 200" + + +rule deepRiPe_parclip: + input: + variants=rules.extract_variants.output, + encoded_seqs= rules.encode_150.output, + output: + anno_dir / ("{file_stem}_variants.parclip_deepripe.csv.gz"), + threads: 8 + resources: + mem_mb=lambda wildcards, attempt: 100_000 + 25_000 * (attempt), + shell: + f"cp -R {saved_deepripe_models_path} $TMPDIR && python /home/m991k/deeprvat_vep_plugin/deeprvat/deeprvat/annotations/annotations.py scorevariants-deepripe {{input.variants}} {anno_dir} $TMPDIR/deepripe_models {{input.encoded_seqs}} 512 'parclip'" + + +rule deepRiPe_eclip_hg2: + input: + variants=rules.extract_variants.output, + fasta=fasta_dir / fasta_file_name, + encoded_seqs= rules.encode_200.output, + output: + anno_dir / ("{file_stem}_variants.eclip_hg2_deepripe.csv.gz"), + threads: 8 + resources: + mem_mb=lambda wildcards, attempt: 100_000 + 25_000 * (attempt), + shell: + f"cp -R {saved_deepripe_models_path} $TMPDIR && python /home/m991k/deeprvat_vep_plugin/deeprvat/deeprvat/annotations/annotations.py scorevariants-deepripe {{input.variants}} {anno_dir} $TMPDIR/deepripe_models {{input.encoded_seqs}} 512 'eclip_hg2'" + + +rule deepRiPe_eclip_k5: + input: + variants=rules.extract_variants.output, + fasta=fasta_dir / fasta_file_name, + encoded_seqs= rules.encode_200.output, + output: + anno_dir / ("{file_stem}_variants.eclip_k5_deepripe.csv.gz"), + threads: 8 + resources: + mem_mb=lambda wildcards, attempt: 100_000 + 25_000 * (attempt), + shell: + f"cp -R {saved_deepripe_models_path} $TMPDIR && python /home/m991k/deeprvat_vep_plugin/deeprvat/deeprvat/annotations/annotations.py scorevariants-deepripe {{input.variants}} {anno_dir} $TMPDIR/deepripe_models {{input.encoded_seqs}} 512 'eclip_k5'"