From d5cfeaf7373379055839853def0d231347272c07 Mon Sep 17 00:00:00 2001 From: Luis Pedro Coelho Date: Wed, 18 Oct 2023 12:30:22 +1000 Subject: [PATCH] MIN Better error messages Also refactoring to simplify internal interfaces and avoid code duplications --- SemiBin/generate_coverage.py | 45 +++++++++++++++++------------------- test/test_cov.py | 3 ++- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/SemiBin/generate_coverage.py b/SemiBin/generate_coverage.py index d86e6fc..b4ec10f 100644 --- a/SemiBin/generate_coverage.py +++ b/SemiBin/generate_coverage.py @@ -71,7 +71,7 @@ def calculate_coverage(depth_stream, bam_file, must_link_threshold, edge=75, is_ return pd.DataFrame({ '{0}_mean'.format(bam_file): mean_coverage, '{0}_var'.format(bam_file): var, - }, index=contigs) + }, index=contigs), None def generate_cov(bam_file, bam_index, out, threshold, @@ -97,16 +97,27 @@ def generate_cov(bam_file, bam_index, out, threshold, universal_newlines=True, stdout=subprocess.PIPE) + contig_cov, must_link_contig_cov = calculate_coverage( + bed_p.stdout, + bam_file, + threshold, + is_combined=is_combined, + sep=sep, + contig_threshold=(contig_threshold if sep is None else 1000), + contig_threshold_dict=(contig_threshold if sep is not None else None)) + + if bed_p.wait() != 0: + raise OSError(f"Failure in running bedtools ({bam_file})") + elif len(contig_cov) == 0: + logger.critical(f"Running `bedtools genomecov` did not return an error, but the result is an empty file (processing {bam_file}). " + "Please check your input files: SemiBin expects that they are sorted BAM files.") + raise OSError(f"Running bedtools returned an empty file ({bam_file})") + + contig_cov += 1e-5 + with atomic_write(os.path.join(out, '{}_data_cov.csv'.format(bam_name)), overwrite=True) as ofile: + contig_cov.to_csv(ofile) + if is_combined: - contig_cov, must_link_contig_cov = calculate_coverage(bed_p.stdout, bam_file, threshold, is_combined = is_combined, sep = sep, contig_threshold = contig_threshold if sep is None else 1000, contig_threshold_dict = contig_threshold if sep is not None else None) - if bed_p.wait() != 0: - logger.critical(f"Running `bedtools genomecov` failed ({bam_file}). Please check your input files: SemiBin expects that they are sorted BAM files.") - raise OSError(f"Failure running `bedtools genomecov` ({bam_file})") - elif len(contig_cov) == 0: - logger.critical("Running `bedtools genomecov` did not return an error, but the result is an empty file. Please check your input files: SemiBin expects that they are sorted BAM files.") - raise OSError("Running bedtools returned an empty file") - - contig_cov = contig_cov.apply(lambda x: x + 1e-5) must_link_contig_cov = must_link_contig_cov.apply(lambda x: x + 1e-5) if sep is None: abun_scale = (contig_cov.mean() / 100).apply(np.ceil) * 100 @@ -114,22 +125,8 @@ def generate_cov(bam_file, bam_index, out, threshold, contig_cov = contig_cov.div(abun_scale) must_link_contig_cov = must_link_contig_cov.div(abun_split_scale) - with atomic_write(os.path.join(out, '{}_data_cov.csv'.format(bam_name)), overwrite=True) as ofile: - contig_cov.to_csv(ofile) - with atomic_write(os.path.join(out, '{}_data_split_cov.csv'.format(bam_name)), overwrite=True) as ofile: must_link_contig_cov.to_csv(ofile) - else: - contig_cov = calculate_coverage(bed_p.stdout, bam_file, threshold, is_combined=is_combined, sep = sep, - contig_threshold = contig_threshold if sep is None else 1000, - contig_threshold_dict = contig_threshold if sep is not None else None) - if bed_p.wait() != 0: - raise OSError("Failure in running bedtools") - - contig_cov = contig_cov.apply(lambda x: x + 1e-5) - - with atomic_write(os.path.join(out, '{}_data_cov.csv'.format(bam_name)), overwrite=True) as ofile: - contig_cov.to_csv(ofile) return bam_file diff --git a/test/test_cov.py b/test/test_cov.py index bc88453..5011b8d 100644 --- a/test/test_cov.py +++ b/test/test_cov.py @@ -40,12 +40,13 @@ def test_cov(): assert_frame_equal(cov_split, pd.DataFrame([2.609756, 2.554217, 2.682243, 2.574074], index=[ 'k141_63080_1', 'k141_63080_2', 'k141_0_1', 'k141_0_2'], columns=['cov'])) - cov = calculate_coverage( + cov, split = calculate_coverage( test_data, 'test_data', must_link_threshold=0, is_combined=False, contig_threshold=0) + assert split is None cov.columns = ['mean', 'var'] assert_frame_equal(cov, pd.DataFrame([[2.581818, 0.606942], [ 2.627907, 0.252244]], index=['k141_63080', 'k141_0'], columns=['mean', 'var']))