optimize logging, only stderr is real-time logging #96

Open
wants to merge 1 commit into base: master
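The change replaces plain `print(...)` calls with `print(..., file=sys.stderr)` so that progress messages appear in real time: stderr is not block-buffered, so lines show up immediately even when stdout is piped or redirected, and stdout stays free for actual program output. A minimal standalone sketch of the pattern (illustrative only, not code from this repository):

```python
import sys
import time

def log(*args):
    # Progress messages go to stderr: not block-buffered, so they appear
    # immediately even when stdout is piped or redirected to a file.
    print(*args, file=sys.stderr)

def main():
    log("starting work....")
    for step in range(3):
        time.sleep(1)              # stand-in for a long-running stage
        log(f"finished step {step + 1}/3")
    # Real results still go to stdout, so `python script.py > results.txt`
    # captures only the data while progress stays visible on the terminal.
    print("final result: 42")

if __name__ == "__main__":
    main()
```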
49 changes: 20 additions & 29 deletions utilities/genSeqErrorModel.py
@@ -1,17 +1,3 @@
#!/usr/bin/env source

#
#
# genSeqErrorModel.source
# Computes sequencing error model for gen_reads.source
#
#
# Usage: source genSeqErrorModel.source -i input_reads.fq -o path/to/output_name.p
#
#
# Python 3 ready


import numpy as np
import argparse
import sys
@@ -21,6 +7,7 @@
import pysam
from functools import reduce

print("genSeqErrorModel starts running....", file=sys.stderr)
# enables import from neighboring package
sys.path.append(str(pathlib.Path(__file__).resolve().parents[1]))

@@ -32,24 +19,24 @@ def parse_file(input_file, real_q, off_q, max_reads, n_samp, plot_stuff):
prob_smooth = 0.

# Takes a gzip or sam file and returns the simulation's average error rate,
print('reading ' + input_file + '...')
print('reading ' + input_file + '...', file=sys.stderr)
is_aligned = False
lines_to_read = 0
try:
if input_file[-4:] == '.bam' or input_file[-4:] == '.sam':
print('detected aligned file....')
print('detected aligned file....', file=sys.stderr)
stats = pysam.idxstats(input_file).strip().split('\n')
lines_to_read = reduce(lambda x, y: x + y, [eval('+'.join(l.rstrip('\n').split('\t')[2:])) for l in stats])
f = pysam.AlignmentFile(input_file)
is_aligned = True
else:
print('detected fastq file....')
print('detected fastq file....', file=sys.stderr)
with pysam.FastxFile(input_file) as f:
for _ in f:
lines_to_read += 1
f = pysam.FastxFile(input_file)
except FileNotFoundError:
print("Check input file. Must be fastq, gzipped fastq, or bam/sam file.")
print("Check input file. Must be fastq, gzipped fastq, or bam/sam file.", file=sys.stderr)
sys.exit(1)

actual_readlen = 0
@@ -69,14 +56,14 @@ def parse_file(input_file, real_q, off_q, max_reads, n_samp, plot_stuff):
qualities_to_check = read.get_quality_array()
if actual_readlen == 0:
actual_readlen = len(qualities_to_check) - 1
print('assuming read length is uniform...')
print('detected read length (from first read found):', actual_readlen)
print('assuming read length is uniform...', file=sys.stderr)
print('detected read length (from first read found):', actual_readlen, file=sys.stderr)
prior_q = np.zeros([actual_readlen, real_q])
total_q = [None] + [np.zeros([real_q, real_q]) for n in range(actual_readlen - 1)]

# sanity-check readlengths
if len(qualities_to_check) - 1 != actual_readlen:
print('skipping read with unexpected length...')
print('skipping read with unexpected length...', file=sys.stderr)
continue

for i in range(actual_readlen):
@@ -100,13 +87,13 @@ def parse_file(input_file, real_q, off_q, max_reads, n_samp, plot_stuff):
# some sanity checking again...
q_range = [min(q_dict.keys()), max(q_dict.keys())]
if q_range[0] < 0:
print('\nError: Read in Q-scores below 0\n')
print('\nError: Read in Q-scores below 0\n', file=sys.stderr)
exit(1)
if q_range[1] > real_q:
print('\nError: Read in Q-scores above specified maximum:', q_range[1], '>', real_q, '\n')
print('\nError: Read in Q-scores above specified maximum:', q_range[1], '>', real_q, '\n', file=sys.stderr)
exit(1)

print('computing probabilities...')
print('computing probabilities...', file=sys.stderr)
prob_q = [None] + [[[0. for m in range(real_q)] for n in range(real_q)] for p in range(actual_readlen - 1)]
for p in range(1, actual_readlen):
for i in range(real_q):
@@ -188,7 +175,7 @@ def parse_file(input_file, real_q, off_q, max_reads, n_samp, plot_stuff):
# mpl.tight_layout()
mpl.show()

print('estimating average error rate via simulation...')
print('estimating average error rate via simulation...', file=sys.stderr)
q_scores = range(real_q)
# print (len(init_q), len(init_q[0]))
# print (len(prob_q), len(prob_q[1]), len(prob_q[1][0]))
@@ -223,7 +210,7 @@ def parse_file(input_file, real_q, off_q, max_reads, n_samp, plot_stuff):
eVal = 10. ** (-k / 10.)
# print k, eVal, count_dict[k]
avg_err += eVal * (count_dict[k] / tot_bases)
print('AVG ERROR RATE:', avg_err)
print('AVG ERROR RATE:', avg_err, file=sys.stderr)

return init_q, prob_q, avg_err

@@ -248,15 +235,18 @@ def main():

(infile, outfile, off_q, max_q, max_reads, n_samp) = (args.i, args.o, args.q, args.Q, args.n, args.s)
(infile2, pile_up) = (args.i2, args.p)
print("Read1 is {}, Read2 is {}, output model is {}".format(infile, infile2, outfile), file=sys.stderr)

real_q = max_q + 1

plot_stuff = args.plot

q_scores = range(real_q)
if infile2 is None:
print('Read2 not provided. Generating sequencing error model from single-end reads', file=sys.stderr)
(init_q, prob_q, avg_err) = parse_file(infile, real_q, off_q, max_reads, n_samp, plot_stuff)
else:
print('Read2 provided. Generating sequencing error model from paired-end reads', file=sys.stderr)
(init_q, prob_q, avg_err1) = parse_file(infile, real_q, off_q, max_reads, n_samp, plot_stuff)
(init_q2, prob_q2, avg_err2) = parse_file(infile2, real_q, off_q, max_reads, n_samp, plot_stuff)
avg_err = (avg_err1 + avg_err2) / 2.
@@ -266,7 +256,7 @@ def main():
#
if pile_up == None:

print('Using default sequencing error parameters...')
print('Using default sequencing error parameters...', file=sys.stderr)

# sequencing substitution transition probabilities
sse_prob = [[0., 0.4918, 0.3377, 0.1705],
@@ -287,7 +277,7 @@ def main():
# otherwise we need to parse a pileup and compute statistics!
#
else:
print('\nPileup parsing coming soon!\n')
print('\nPileup parsing coming soon!\n', file=sys.stderr)
exit(1)

err_params = [sse_prob, sie_rate, sie_prob, sie_val, sie_ins_freq, sie_ins_nucl]
@@ -296,12 +286,13 @@ def main():
# finally, let's save our output model
#
outfile = pathlib.Path(outfile).with_suffix(".p")
print('saving model...')
print('saving model...', file=sys.stderr)
if infile2 is None:
pickle.dump([init_q, prob_q, q_scores, off_q, avg_err, err_params], open(outfile, 'wb'))
else:
pickle.dump([init_q, prob_q, init_q2, prob_q2, q_scores, off_q, avg_err, err_params], open(outfile, 'wb'))


if __name__ == '__main__':
print("genSeqErrorModel starts executing the main function....", file=sys.stderr)
main()
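For a rough check of the stdout/stderr split, one hypothetical approach using only the standard library (the input and output filenames below are placeholders based on the script's usage string, not values taken from this PR):

```python
import subprocess
import sys

# Hypothetical invocation; substitute real paths from your checkout.
cmd = [sys.executable, "utilities/genSeqErrorModel.py",
       "-i", "input_reads.fq", "-o", "output_name.p"]

result = subprocess.run(cmd, capture_output=True, text=True)

# With this change applied, progress lines should arrive on stderr
# and stdout should carry no logging output.
print("stdout:", repr(result.stdout))
print("stderr:")
for line in result.stderr.splitlines():
    print("   ", line)
```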