Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Write regions file as VCF is being processed instead of saving in memory #159

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 15 additions & 37 deletions definitions/tools/bam_readcount.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,6 @@ task bamReadcount {
import csv
from subprocess import Popen, PIPE

def generate_region_list(hash):
fh = tempfile.NamedTemporaryFile("w", delete=False)
writer = csv.writer(fh, delimiter="\t")
for chr, positions in hash.items():
for pos in sorted(positions.keys()):
writer.writerow([chr, pos, pos])
fh.close()
return fh.name

def filter_sites_in_hash(region_list, bam_file, ref_fasta, prefixed_sample, output_dir, insertion_centric, map_qual, base_qual):
bam_readcount_cmd = ["/usr/bin/bam-readcount", "-f", ref_fasta, "-l", region_list, "-w", "0", "-b", str(base_qual), "-q", str(map_qual)]
if insertion_centric:
Expand Down Expand Up @@ -73,8 +64,10 @@ task bamReadcount {
vcf_file = VCF(vcf_filename)
sample_index = vcf_file.samples.index(sample)

rc_for_indel = {}
rc_for_snp = {}
snv_region_fh = tempfile.NamedTemporaryFile("w", delete=False)
snv_region_writer = csv.writer(snv_region_fh, delimiter="\t")
indel_region_fh = tempfile.NamedTemporaryFile("w", delete=False)
indel_region_writer = csv.writer(indel_region_fh, delimiter="\t")
for variant in vcf_file:
ref = variant.REF
chr = variant.CHROM
Expand All @@ -90,42 +83,27 @@ task bamReadcount {
elif len(ref) > len(var):
#it is a deletion
pos += 1
unmodified_ref = ref
ref = unmodified_ref[1]
var = "-%s" % unmodified_ref[1:]
else:
#it is an insertion
var = "+%s" % var[1:]
if chr not in rc_for_indel:
rc_for_indel[chr] = {}
if pos not in rc_for_indel[chr]:
rc_for_indel[chr][pos] = {}
if ref not in rc_for_indel[chr][pos]:
rc_for_indel[chr][pos][ref] = {}
rc_for_indel[chr][pos][ref] = variant
indel_region_writer.writerow([chr, pos, pos])
else:
#it is a SNP
if chr not in rc_for_snp:
rc_for_snp[chr] = {}
if pos not in rc_for_snp[chr]:
rc_for_snp[chr][pos] = {}
if ref not in rc_for_snp[chr][pos]:
rc_for_snp[chr][pos][ref] = {}
rc_for_snp[chr][pos][ref] = variant
snv_region_writer.writerow([chr, pos, pos])
snv_region_fh.close()
indel_region_fh.close()

if len(rc_for_snp.keys()) > 0:
region_file = generate_region_list(rc_for_snp)
filter_sites_in_hash(region_file, bam_file, ref_fasta, prefixed_sample, output_dir, False, min_mapping_qual, min_base_qual)
if os.path.getsize(snv_region_fh.name) > 0:
filter_sites_in_hash(snv_region_fh.name, bam_file, ref_fasta, prefixed_sample, output_dir, False, min_mapping_qual, min_base_qual)
else:
output_file = os.path.join(output_dir, prefixed_sample + "_bam_readcount_snv.tsv")
open(output_file, "w").close()

if len(rc_for_indel.keys()) > 0:
region_file = generate_region_list(rc_for_indel)
filter_sites_in_hash(region_file, bam_file, ref_fasta, prefixed_sample, output_dir, True, min_mapping_qual, min_base_qual)
if os.path.getsize(indel_region_fh.name) > 0:
filter_sites_in_hash(indel_region_fh.name, bam_file, ref_fasta, prefixed_sample, output_dir, True, min_mapping_qual, min_base_qual)
else:
output_file = os.path.join(output_dir, prefixed_sample + "_bam_readcount_indel.tsv")
open(output_file, "w").close()

os.remove(snv_region_fh.name)
os.remove(indel_region_fh.name)
' > ~{stdout_file}
>>>

Expand Down
Binary file modified workflows.zip
Binary file not shown.
Loading