Skip to content

Commit

Permalink
MIN Better error messages
Browse files Browse the repository at this point in the history
Also refactoring to simplify internal interfaces and avoid code
duplication
  • Loading branch information
luispedro committed Oct 18, 2023
1 parent bdc1087 commit d5cfeaf
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 25 deletions.
45 changes: 21 additions & 24 deletions SemiBin/generate_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def calculate_coverage(depth_stream, bam_file, must_link_threshold, edge=75, is_
return pd.DataFrame({
'{0}_mean'.format(bam_file): mean_coverage,
'{0}_var'.format(bam_file): var,
}, index=contigs)
}, index=contigs), None


def generate_cov(bam_file, bam_index, out, threshold,
Expand All @@ -97,39 +97,36 @@ def generate_cov(bam_file, bam_index, out, threshold,
universal_newlines=True,
stdout=subprocess.PIPE)

contig_cov, must_link_contig_cov = calculate_coverage(
bed_p.stdout,
bam_file,
threshold,
is_combined=is_combined,
sep=sep,
contig_threshold=(contig_threshold if sep is None else 1000),
contig_threshold_dict=(contig_threshold if sep is not None else None))

if bed_p.wait() != 0:
raise OSError(f"Failure in running bedtools ({bam_file})")
elif len(contig_cov) == 0:
logger.critical(f"Running `bedtools genomecov` did not return an error, but the result is an empty file (processing {bam_file}). "
"Please check your input files: SemiBin expects that they are sorted BAM files.")
raise OSError(f"Running bedtools returned an empty file ({bam_file})")

contig_cov += 1e-5

This comment has been minimized.

Copy link
@psj1997

psj1997 Nov 5, 2023

Collaborator

This change introduces a bug: the coverage file is no longer normalized when it is generated in combined mode.

This comment has been minimized.

Copy link
@luispedro

luispedro Nov 5, 2023

Author Member

You're right, good catch!

with atomic_write(os.path.join(out, '{}_data_cov.csv'.format(bam_name)), overwrite=True) as ofile:
contig_cov.to_csv(ofile)

if is_combined:
contig_cov, must_link_contig_cov = calculate_coverage(bed_p.stdout, bam_file, threshold, is_combined = is_combined, sep = sep, contig_threshold = contig_threshold if sep is None else 1000, contig_threshold_dict = contig_threshold if sep is not None else None)
if bed_p.wait() != 0:
logger.critical(f"Running `bedtools genomecov` failed ({bam_file}). Please check your input files: SemiBin expects that they are sorted BAM files.")
raise OSError(f"Failure running `bedtools genomecov` ({bam_file})")
elif len(contig_cov) == 0:
logger.critical("Running `bedtools genomecov` did not return an error, but the result is an empty file. Please check your input files: SemiBin expects that they are sorted BAM files.")
raise OSError("Running bedtools returned an empty file")

contig_cov = contig_cov.apply(lambda x: x + 1e-5)
must_link_contig_cov = must_link_contig_cov.apply(lambda x: x + 1e-5)
if sep is None:
abun_scale = (contig_cov.mean() / 100).apply(np.ceil) * 100
abun_split_scale = (must_link_contig_cov.mean() / 100).apply(np.ceil) * 100
contig_cov = contig_cov.div(abun_scale)
must_link_contig_cov = must_link_contig_cov.div(abun_split_scale)

with atomic_write(os.path.join(out, '{}_data_cov.csv'.format(bam_name)), overwrite=True) as ofile:
contig_cov.to_csv(ofile)

with atomic_write(os.path.join(out, '{}_data_split_cov.csv'.format(bam_name)), overwrite=True) as ofile:
must_link_contig_cov.to_csv(ofile)
else:
contig_cov = calculate_coverage(bed_p.stdout, bam_file, threshold, is_combined=is_combined, sep = sep,
contig_threshold = contig_threshold if sep is None else 1000,
contig_threshold_dict = contig_threshold if sep is not None else None)
if bed_p.wait() != 0:
raise OSError("Failure in running bedtools")

contig_cov = contig_cov.apply(lambda x: x + 1e-5)

with atomic_write(os.path.join(out, '{}_data_cov.csv'.format(bam_name)), overwrite=True) as ofile:
contig_cov.to_csv(ofile)

return bam_file

Expand Down
3 changes: 2 additions & 1 deletion test/test_cov.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,13 @@ def test_cov():
assert_frame_equal(cov_split, pd.DataFrame([2.609756, 2.554217, 2.682243, 2.574074], index=[
'k141_63080_1', 'k141_63080_2', 'k141_0_1', 'k141_0_2'], columns=['cov']))

cov = calculate_coverage(
cov, split = calculate_coverage(
test_data,
'test_data',
must_link_threshold=0,
is_combined=False,
contig_threshold=0)
assert split is None
cov.columns = ['mean', 'var']
assert_frame_equal(cov, pd.DataFrame([[2.581818, 0.606942], [
2.627907, 0.252244]], index=['k141_63080', 'k141_0'], columns=['mean', 'var']))

0 comments on commit d5cfeaf

Please sign in to comment.