diff --git a/micov/_io.py b/micov/_io.py index c249c15..41f2d1b 100644 --- a/micov/_io.py +++ b/micov/_io.py @@ -40,7 +40,7 @@ def _parse_bed_cov(data, feature_drop, feature_keep, lazy): frame = pl.read_csv(data.read(), separator='\t', new_columns=BED_COV_SCHEMA.columns, - dtypes=BED_COV_SCHEMA.dtypes_dict, + schema_overrides=BED_COV_SCHEMA.dtypes_dict, has_header=False, skip_rows=skip_rows).lazy() if feature_drop is not None: @@ -130,7 +130,7 @@ def _parse_qiita_coverages(tgz, compress_size=50_000_000, sample_keep=None, def _single_df(coverages): if len(coverages) > 1: - df = pl.concat(coverages) + df = pl.concat(coverages, rechunk=True) elif len(coverages) == 0: raise ValueError("No coverages") else: @@ -315,13 +315,13 @@ def compress_from_stream(sam, bufsize=100_000_000, disable_compression=False): current_df = compress_f(pl.concat([current_df, next_df])) buf = data.readlines(bufsize) - return current_df + return current_df.rechunk() def parse_coverage(data, features_to_keep): cov_df = pl.read_csv(data.read(), separator='\t', new_columns=GENOME_COVERAGE_SCHEMA.columns, - dtypes=GENOME_COVERAGE_SCHEMA.dtypes_dict).lazy() + schema_overrides=GENOME_COVERAGE_SCHEMA.dtypes_dict).lazy() if features_to_keep is not None: cov_df = cov_df.filter(pl.col(COLUMN_GENOME_ID).is_in(feature_keep)) diff --git a/micov/_per_sample.py b/micov/_per_sample.py index f0885a9..14ab29d 100644 --- a/micov/_per_sample.py +++ b/micov/_per_sample.py @@ -32,4 +32,4 @@ def compress_per_sample(coverage, lengths): if len(sample_contig_coverage) == 0: return None else: - return pl.concat(sample_contig_coverage) + return pl.concat(sample_contig_coverage, rechunk=True) diff --git a/micov/tests/test_cov.py b/micov/tests/test_cov.py index 2acaefd..23176bf 100644 --- a/micov/tests/test_cov.py +++ b/micov/tests/test_cov.py @@ -15,6 +15,7 @@ def test_compress(self): ['G123', 101, 110], ['G456', 200, 300], ['G456', 400, 500]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) data = pl.DataFrame([['G123', 11, 50], ['G123', 20, 30], @@ -25,6 +26,7 @@ def test_compress(self): ['G123', 51, 89], ['G123', 101, 110], ['G456', 400, 500]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) obs = compress(data).sort(COLUMN_GENOME_ID) plt.assert_frame_equal(obs, exp) @@ -34,16 +36,19 @@ def test_coverage_percent(self): ['G456', 200, 299], ['G123', 90, 100], ['G456', 400, 500]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) lengths = pl.DataFrame([['G123', 100], ['G456', 1000], ['G789', 500]], + orient='row', schema=GENOME_LENGTH_SCHEMA.dtypes_flat) g123_covered = (50 - 11) + (100 - 90) g456_covered = (299 - 200) + (500 - 400) exp = pl.DataFrame([['G123', g123_covered, 100, (g123_covered / 100) * 100], ['G456', g456_covered, 1000, (g456_covered / 1000) * 100]], + orient='row', schema=GENOME_COVERAGE_SCHEMA.dtypes_flat) obs = coverage_percent(data, lengths).sort(COLUMN_GENOME_ID).collect() diff --git a/micov/tests/test_io.py b/micov/tests/test_io.py index a92f0f3..0d882d9 100644 --- a/micov/tests/test_io.py +++ b/micov/tests/test_io.py @@ -86,6 +86,7 @@ def test_write_qiita_cov(self): lengths = pl.DataFrame([['GXXX', 600], ['GYYY', 1100], ['GZZZ', 2000]], + orient='row', schema=GENOME_LENGTH_SCHEMA.dtypes_flat) write_qiita_cov(self.name, paths, lengths) @@ -101,11 +102,13 @@ def test_write_qiita_cov(self): ['GYYY', 100, 400], ['GYYY', 500, 1000], ['GZZZ', 200, 400]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) exp_cov_percent = pl.DataFrame([['GXXX', 400, 600, (400 / 600) * 100], ['GYYY', 800, 1100, (800 / 1100) * 100], ['GZZZ', 200, 2000, (200 / 2000) * 100]], + orient='row', schema=GENOME_COVERAGE_SCHEMA.dtypes_flat) obs_artifact_cov = obs_artifact_cov.sort([COLUMN_GENOME_ID, COLUMN_START]) @@ -124,6 +127,7 @@ def test_parse_qiita_coverages(self): ['G123', 1000, 10000], ['G456', 5, 20], ['G789', 1, 100]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) # always compress obs = parse_qiita_coverages(self.name) @@ -137,6 +141,7 @@ def test_parse_qiita_coverages_always_compress(self): ['G123', 1000, 10000], ['G456', 5, 20], ['G789', 1, 100]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) # always compress obs = parse_qiita_coverages(self.name, compress_size=0) @@ -152,6 +157,7 @@ def test_parse_qiita_coverages_never_compress(self): ['G456', 5, 20], ['G789', 1, 100], ['G789', 2, 40]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) obs = parse_qiita_coverages(self.name, compress_size=None) obs = obs.sort([COLUMN_GENOME_ID, COLUMN_START]) @@ -163,6 +169,7 @@ def test_parse_qiita_coverages_keep(self): ['G123', 300, 400], ['G456', 5, 20], ['G789', 1, 100]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) obs = parse_qiita_coverages(self.name, sample_keep={'sample_a', 'sample_b'}) @@ -175,6 +182,7 @@ def test_parse_qiita_coverages_drop(self): ['G123', 300, 400], ['G456', 5, 20], ['G789', 1, 100]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) obs = parse_qiita_coverages(self.name, sample_drop={'sample_c', }) @@ -186,6 +194,7 @@ def test_parse_qiita_coverages_keep_drop(self): ['G123', 100, 200], ['G456', 5, 20], ['G789', 2, 40]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) obs = parse_qiita_coverages(self.name, sample_drop={'sample_c', }, @@ -198,6 +207,7 @@ def test_parse_qiita_coverages_keep_feature(self): ['G123', 100, 200], ['G123', 300, 400], ['G123', 1000, 10000]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) obs = parse_qiita_coverages(self.name, feature_keep={'G123', }) @@ -207,6 +217,7 @@ def test_parse_qiita_coverages_keep_feature(self): def test_parse_qiita_coverages_drop_feature(self): exp = pl.DataFrame([['G456', 5, 20], ['G789', 2, 40]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) obs = parse_qiita_coverages(self.name, sample_drop={'sample_c', }, @@ -236,6 +247,7 @@ def test_parse_genome_lengths_good(self): exp = pl.DataFrame([['a', 10], ['b', 20], ['c', 30]], + orient='row', schema=[COLUMN_GENOME_ID, COLUMN_LENGTH]) obs = parse_genome_lengths(self.name) plt.assert_frame_equal(obs, exp) @@ -251,6 +263,7 @@ def test_parse_genome_lengths_noheader(self): exp = pl.DataFrame([['a', 10], ['b', 20], ['c', 30]], + orient='row', schema=[COLUMN_GENOME_ID, COLUMN_LENGTH]) obs = parse_genome_lengths(self.name) plt.assert_frame_equal(obs, exp) @@ -303,6 +316,7 @@ def test_compress_from_stream(self): ['X', 90, 150], ['Y', 10, 60], ['Y', 100, 150]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) obs = compress_from_stream(data, bufsize=2) plt.assert_frame_equal(obs.sort([COLUMN_GENOME_ID, ]), exp) @@ -323,6 +337,7 @@ def test_compress_from_stream_disable_compression(self): ['X', 100, 150], ['Y', 10, 60], ['Y', 100, 150]], + orient='row', schema=BED_COV_SCHEMA.dtypes_flat) obs = compress_from_stream(data, bufsize=2, disable_compression=True) plt.assert_frame_equal(obs.sort([COLUMN_GENOME_ID, COLUMN_START]), exp) @@ -339,6 +354,7 @@ def test_parse_sam_to_df(self): exp = pl.DataFrame([['A', 0, 'X', 1, '50M', 51], ['B', 0, 'Y', 10, '50M', 60], ['C', 0, 'X', 100, '50M', 150]], + orient='row', schema=SAM_SUBSET_SCHEMA_PARSED.dtypes_flat) obs = parse_sam_to_df(data) plt.assert_frame_equal(obs, exp) diff --git a/micov/tests/test_per_sample.py b/micov/tests/test_per_sample.py index 4556147..97a0e9e 100644 --- a/micov/tests/test_per_sample.py +++ b/micov/tests/test_per_sample.py @@ -12,6 +12,7 @@ class Tests(unittest.TestCase): def test_compress_per_sample(self): lengths = pl.DataFrame([['A', 400], ['B', 500]], + orient='row', schema=GENOME_LENGTH_SCHEMA.dtypes_flat) df = pl.DataFrame([['A', 10, 100, 'S1'], ['A', 10, 20, 'S1'], @@ -22,6 +23,7 @@ def test_compress_per_sample(self): ['A', 50, 150, 'S2'], ['A', 200, 300, 'S1'], ['A', 201, 299, 'S1']], + orient='row', schema=BED_COV_SAMPLEID_SCHEMA.dtypes_flat) s1_a = ((110 - 10) + (300 - 200)) s1_b = ((150 - 50) + (250 - 200)) @@ -29,6 +31,7 @@ def test_compress_per_sample(self): exp = pl.DataFrame([['A', s1_a, 400, (s1_a / 400) * 100, 'S1'], ['A', s2_a, 400, (s2_a / 400) * 100, 'S2'], ['B', s1_b, 500, (s1_b / 500) * 100, 'S1']], + orient='row', schema=GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA.dtypes_flat) obs = compress_per_sample(df, lengths).sort([COLUMN_GENOME_ID, COLUMN_SAMPLE_ID]).collect()