Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Postprocess dedup filter #59

Open
wants to merge 24 commits into
base: remove_pybedtools_dependency
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions neusomatic/python/call.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import shutil
import pickle
import multiprocessing
import copy

import pysam
import numpy as np
Expand Down Expand Up @@ -68,6 +69,11 @@ def call_variants(net, vartype_classes, call_loader, out_dir, model_tag, use_cud
for data in loader_:
(matrices, labels, var_pos_s, var_len_s,
non_transformed_matrices), (paths) = data

paths_ = copy.deepcopy(paths)
del paths
paths = paths_

matrices = Variable(matrices)
iii += 1
j += len(paths[0])
Expand Down
65 changes: 26 additions & 39 deletions neusomatic/python/filter_candidates.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# filter_candidates.py
# filter raw candidates extracted by 'scan_alignments.py' using min_af and other cut-offs
#-------------------------------------------------------------------------
import os
import argparse
import traceback
import logging
Expand All @@ -24,6 +25,16 @@ def filter_candidates(candidate_record):
thread_logger.info(
"---------------------Filter Candidates---------------------")

if dbsnp:
if dbsnp[-6:] != "vcf.gz":
msahraeian marked this conversation as resolved.
Show resolved Hide resolved
thread_logger.error("Aborting!")
raise Exception(
"The dbSNP file should be a tabix indexed file with .vcf.gz format")
if not os.path.exists(dbsnp + ".tbi"):
thread_logger.error("Aborting!")
raise Exception(
"The dbSNP file should be a tabix indexed file with .vcf.gz format. No {}.tbi file exists.".format(dbsnp))

records = {}
with open(candidates_vcf) as v_f:
for line in v_f:
Expand Down Expand Up @@ -260,49 +271,25 @@ def filter_candidates(candidate_record):
final_records.append([chrom, pos - 1, ref, alt, line])
final_records = sorted(final_records, key=lambda x: x[0:2])
if dbsnp:
filtered_bed = get_tmp_file()
intervals = []
for x in enumerate(final_records):
intervals.append([x[1][0], int(x[1][1]), int(
x[1][1]) + 1, x[1][2], x[1][3], str(x[0])])
write_tsv_file(filtered_bed, intervals)
filtered_bed = bedtools_sort(
filtered_bed, run_logger=thread_logger)

dbsnp_tmp = get_tmp_file()
vcf_2_bed(dbsnp, dbsnp_tmp)
bedtools_sort(dbsnp_tmp, output_fn=dbsnp, run_logger=thread_logger)
non_in_dbsnp_1 = bedtools_window(
filtered_bed, dbsnp, args=" -w 0 -v", run_logger=thread_logger)
non_in_dbsnp_2 = bedtools_window(
filtered_bed, dbsnp, args=" -w 0", run_logger=thread_logger)

cmd = '''awk '($2!=$8)||($4!=$10)||($5!=$11){{print $0}}' {}'''.format(
non_in_dbsnp_2)
non_in_dbsnp_2 = run_bedtools_cmd(cmd, run_logger=thread_logger)
non_in_dbsnp_2 = bedtools_sort(
non_in_dbsnp_2, run_logger=thread_logger)

non_in_dbsnp_ids = []
with open(non_in_dbsnp_1) as i_f:
for line in i_f:
if not line.strip():
continue
x = line.strip().split("\t")
non_in_dbsnp_ids.append(int(x[5]))
with open(non_in_dbsnp_2) as i_f:
for line in i_f:
if not line.strip():
continue
x = line.strip().split("\t")
non_in_dbsnp_ids.append(int(x[5]))
final_records = list(map(lambda x: x[1], filter(
lambda x: x[0] in non_in_dbsnp_ids, enumerate(final_records))))
dbsnp_tb = pysam.TabixFile(dbsnp)
with open(filtered_candidates_vcf, "w") as o_f:
o_f.write("##fileformat=VCFv4.2\n")
o_f.write(
"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n")
for record in final_records:
if dbsnp:
chrom, pos, ref, alt = record[0:4]
var_id = "-".join(map(str,[chrom, pos, ref, alt]))
region = "{}:{}-{}".format(chrom, pos, pos + 1)
dbsnp_vars = []
for x in dbsnp_tb.fetch(region=region):
chrom_, pos_, _, ref_, alts_ = x.strip().split("\t")[
0:5]
for alt_ in alts_.split(","):
dbsnp_var_id = "-".join(map(str,[chrom_, pos_, ref_, alt_]))
dbsnp_vars.append(dbsnp_var_id)
if var_id in dbsnp_vars:
continue
o_f.write(record[-1] + "\n")
return filtered_candidates_vcf

Expand All @@ -325,7 +312,7 @@ def filter_candidates(candidate_record):
parser.add_argument('--reference', type=str, help='reference fasta filename',
required=True)
parser.add_argument('--dbsnp_to_filter', type=str,
help='dbsnp vcf (will be used to filter candidate variants)', default=None)
help='dbsnp vcf.gz (will be used to filter candidate variants)', default=None)
parser.add_argument('--good_ao', type=float, help='good alternate count (ignores maf)',
default=10)
parser.add_argument('--min_ao', type=float,
Expand Down
72 changes: 42 additions & 30 deletions neusomatic/python/generate_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import numpy as np
import pysam
from scipy.misc import imresize
from PIL import Image

from split_bed import split_region
from utils import concatenate_vcfs, get_chromosomes_order, run_bedtools_cmd, vcf_2_bed, bedtools_sort, bedtools_window, bedtools_intersect, bedtools_slop, get_tmp_file
Expand Down Expand Up @@ -520,39 +520,43 @@ def prepare_info_matrices_tabix(ref_file, tumor_count_bed, normal_count_bed, rec
else:
col_pos_map = {i: int(round(v / float(ncols) * matrix_width))
for i, v in col_pos_map.items()}
tumor_count_matrix = imresize(
tumor_matrix, (5, matrix_width)).astype(int)
bq_tumor_count_matrix = imresize(
bq_tumor_matrix, (5, matrix_width)).astype(int)
mq_tumor_count_matrix = imresize(
mq_tumor_matrix, (5, matrix_width)).astype(int)
st_tumor_count_matrix = imresize(
st_tumor_matrix, (5, matrix_width)).astype(int)
lsc_tumor_count_matrix = imresize(
lsc_tumor_matrix, (5, matrix_width)).astype(int)
rsc_tumor_count_matrix = imresize(
rsc_tumor_matrix, (5, matrix_width)).astype(int)
tumor_count_matrix = np.array(Image.fromarray(
tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
bq_tumor_count_matrix = np.array(Image.fromarray(
bq_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
mq_tumor_count_matrix = np.array(Image.fromarray(
mq_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
st_tumor_count_matrix = np.array(Image.fromarray(
st_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
lsc_tumor_count_matrix = np.array(Image.fromarray(
lsc_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)
rsc_tumor_count_matrix = np.array(Image.fromarray(
rsc_tumor_matrix).resize((matrix_width, 5), 2)).astype(int)

tag_tumor_count_matrices = []
for iii in range(len(tag_tumor_matrices)):
tag_tumor_count_matrices.append(
imresize(tag_tumor_matrices[iii], (5, matrix_width)).astype(int))
normal_count_matrix = imresize(
normal_matrix, (5, matrix_width)).astype(int)
bq_normal_count_matrix = imresize(
bq_normal_matrix, (5, matrix_width)).astype(int)
mq_normal_count_matrix = imresize(
mq_normal_matrix, (5, matrix_width)).astype(int)
st_normal_count_matrix = imresize(
st_normal_matrix, (5, matrix_width)).astype(int)
lsc_normal_count_matrix = imresize(
lsc_normal_matrix, (5, matrix_width)).astype(int)
rsc_normal_count_matrix = imresize(
rsc_normal_matrix, (5, matrix_width)).astype(int)
np.array(Image.fromarray(tag_tumor_matrices[iii]).resize((matrix_width, 5), 2)).astype(int))

normal_count_matrix = np.array(Image.fromarray(
normal_matrix).resize((matrix_width, 5), 2)).astype(int)
bq_normal_count_matrix = np.array(Image.fromarray(
bq_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
mq_normal_count_matrix = np.array(Image.fromarray(
mq_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
st_normal_count_matrix = np.array(Image.fromarray(
st_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
lsc_normal_count_matrix = np.array(Image.fromarray(
lsc_normal_matrix).resize((matrix_width, 5), 2)).astype(int)
rsc_normal_count_matrix = np.array(Image.fromarray(
rsc_normal_matrix).resize((matrix_width, 5), 2)).astype(int)

tag_normal_count_matrices = []
for iii in range(len(tag_normal_matrices)):
tag_normal_count_matrices.append(
imresize(tag_normal_matrices[iii], (5, matrix_width)).astype(int))
ref_count_matrix = imresize(ref_matrix, (5, matrix_width)).astype(int)
np.array(Image.fromarray(tag_normal_matrices[iii]).resize((matrix_width, 5), 2)).astype(int))
ref_count_matrix = np.array(Image.fromarray(
ref_matrix).resize((matrix_width, 5), 2)).astype(int)

if int(pos) + rcenter[0] not in col_pos_map:
center = min(col_pos_map.values()) + rcenter[0] - 1 + rcenter[1]
Expand Down Expand Up @@ -881,6 +885,7 @@ def find_records(input_record):
records = []
i = 0
anns = {}
fasta_file = pysam.Fastafile(ref_file)
if ensemble_bed:
with open(not_in_ensemble_bed) as ni_f:
for line in ni_f:
Expand Down Expand Up @@ -1053,8 +1058,16 @@ def find_records(input_record):
if line[0] == "#":
continue
record = line.strip().split()
pos = int(record[1])
if len(record[3]) != len(record[4]) and min(len(record[3]), len(record[4])) > 0 and record[3][0] != record[4][0]:
if pos > 1:
l_base = fasta_file.fetch(
record[0], pos - 2, pos - 1).upper()
record[3] = l_base + record[3]
record[4] = l_base + record[4]
pos -= 1
truth_records.append(
[record[0], int(record[1]), record[3], record[4], str(i)])
[record[0], pos, record[3], record[4], str(i)])
i += 1

truth_bed = get_tmp_file()
Expand Down Expand Up @@ -1095,7 +1108,6 @@ def find_records(input_record):
record_center = {}

chroms_order = get_chromosomes_order(reference=ref_file)
fasta_file = pysam.Fastafile(ref_file)

good_records = {"INS": [], "DEL": [], "SNP": []}
vtype = {}
Expand Down
Loading