diff --git a/micov/__init__.py b/micov/__init__.py index aba121c..41da9d4 100644 --- a/micov/__init__.py +++ b/micov/__init__.py @@ -1,3 +1,5 @@ """micov: microbiome coverage.""" + from . import _version -__version__ = _version.get_versions()['version'] + +__version__ = _version.get_versions()["version"] diff --git a/micov/_constants.py b/micov/_constants.py index cb0e6e5..f36fed0 100644 --- a/micov/_constants.py +++ b/micov/_constants.py @@ -1,29 +1,30 @@ -COLUMN_GENOME_ID = 'genome_id' +COLUMN_GENOME_ID = "genome_id" COLUMN_GENOME_ID_DTYPE = str -COLUMN_SAMPLE_ID = 'sample_id' +COLUMN_SAMPLE_ID = "sample_id" COLUMN_SAMPLE_ID_DTYPE = str -COLUMN_START = 'start' +COLUMN_START = "start" COLUMN_START_DTYPE = int -COLUMN_STOP = 'stop' +COLUMN_STOP = "stop" COLUMN_STOP_DTYPE = int -COLUMN_READ_ID = 'read' +COLUMN_READ_ID = "read" COLUMN_READ_ID_DTYPE = str -COLUMN_FLAG = 'flag' +COLUMN_FLAG = "flag" COLUMN_FLAG_DTYPE = int -COLUMN_CIGAR = 'cigar' +COLUMN_CIGAR = "cigar" COLUMN_CIGAR_DTYPE = str -COLUMN_LENGTH = 'length' +COLUMN_LENGTH = "length" COLUMN_LENGTH_DTYPE = int -COLUMN_TAXONOMY = 'taxonomy' +COLUMN_TAXONOMY = "taxonomy" COLUMN_TAXONOMY_DTYPE = str -COLUMN_COVERED = 'covered' -COLUMN_PERCENT_COVERED = 'percent_covered' +COLUMN_COVERED = "covered" +COLUMN_PERCENT_COVERED = "percent_covered" COLUMN_COVERED_DTYPE = int -COLUMN_PERCENT_COVERED = 'percent_covered' +COLUMN_PERCENT_COVERED = "percent_covered" COLUMN_PERCENT_COVERED_DTYPE = float ### should really probably just use a dataclass, and type annotations? + class _SCHEMA: def __init__(self): self.dtypes_dict = dict(self.dtypes_flat) @@ -31,17 +32,25 @@ def __init__(self): class _BED_COV_SCHEMA(_SCHEMA): - dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), - (COLUMN_START, COLUMN_START_DTYPE), - (COLUMN_STOP, COLUMN_STOP_DTYPE)] + dtypes_flat = [ + (COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), + (COLUMN_START, COLUMN_START_DTYPE), + (COLUMN_STOP, COLUMN_STOP_DTYPE), + ] + + BED_COV_SCHEMA = _BED_COV_SCHEMA() class _BED_COV_WITH_SAMPLEID_SCHEMA(_SCHEMA): - dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), - (COLUMN_START, COLUMN_START_DTYPE), - (COLUMN_STOP, COLUMN_STOP_DTYPE), - (COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE)] + dtypes_flat = [ + (COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), + (COLUMN_START, COLUMN_START_DTYPE), + (COLUMN_STOP, COLUMN_STOP_DTYPE), + (COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE), + ] + + BED_COV_SAMPLEID_SCHEMA = _BED_COV_WITH_SAMPLEID_SCHEMA() @@ -51,48 +60,72 @@ class _SAM_SUBSET_SCHEMA(_SCHEMA): # for binary coverage, we don't care about the flag, but we're parsing it # now so we can care in the future. 
column_indices = [0, 1, 2, 3, 5] - dtypes_flat = [(COLUMN_READ_ID, COLUMN_READ_ID_DTYPE), - (COLUMN_FLAG, COLUMN_FLAG_DTYPE), - (COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), - (COLUMN_START, COLUMN_START_DTYPE), - (COLUMN_CIGAR, COLUMN_CIGAR_DTYPE)] + dtypes_flat = [ + (COLUMN_READ_ID, COLUMN_READ_ID_DTYPE), + (COLUMN_FLAG, COLUMN_FLAG_DTYPE), + (COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), + (COLUMN_START, COLUMN_START_DTYPE), + (COLUMN_CIGAR, COLUMN_CIGAR_DTYPE), + ] + + SAM_SUBSET_SCHEMA = _SAM_SUBSET_SCHEMA() class _SAM_SUBSET_SCHEMA_PARSED(_SCHEMA): - dtypes_flat = [(COLUMN_READ_ID, COLUMN_READ_ID_DTYPE), - (COLUMN_FLAG, COLUMN_FLAG_DTYPE), - (COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), - (COLUMN_START, COLUMN_START_DTYPE), - (COLUMN_CIGAR, COLUMN_CIGAR_DTYPE), - (COLUMN_STOP, COLUMN_STOP_DTYPE)] + dtypes_flat = [ + (COLUMN_READ_ID, COLUMN_READ_ID_DTYPE), + (COLUMN_FLAG, COLUMN_FLAG_DTYPE), + (COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), + (COLUMN_START, COLUMN_START_DTYPE), + (COLUMN_CIGAR, COLUMN_CIGAR_DTYPE), + (COLUMN_STOP, COLUMN_STOP_DTYPE), + ] + + SAM_SUBSET_SCHEMA_PARSED = _SAM_SUBSET_SCHEMA_PARSED() class _GENOME_LENGTH_SCHEMA(_SCHEMA): - dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), - (COLUMN_LENGTH, COLUMN_LENGTH_DTYPE)] + dtypes_flat = [ + (COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), + (COLUMN_LENGTH, COLUMN_LENGTH_DTYPE), + ] + + GENOME_LENGTH_SCHEMA = _GENOME_LENGTH_SCHEMA() class _GENOME_TAXONOMY_SCHEMA(_SCHEMA): - dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), - (COLUMN_TAXONOMY, COLUMN_TAXONOMY_DTYPE)] + dtypes_flat = [ + (COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), + (COLUMN_TAXONOMY, COLUMN_TAXONOMY_DTYPE), + ] + + GENOME_TAXNOMY_SCHEMA = _GENOME_TAXONOMY_SCHEMA() class _GENOME_COVERAGE_SCHEMA(_SCHEMA): - dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), - (COLUMN_COVERED, COLUMN_COVERED_DTYPE), - (COLUMN_LENGTH, COLUMN_LENGTH_DTYPE), - (COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE)] + dtypes_flat = [ + (COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), + (COLUMN_COVERED, COLUMN_COVERED_DTYPE), + (COLUMN_LENGTH, COLUMN_LENGTH_DTYPE), + (COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE), + ] + + GENOME_COVERAGE_SCHEMA = _GENOME_COVERAGE_SCHEMA() class _GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA(_SCHEMA): - dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), - (COLUMN_COVERED, COLUMN_COVERED_DTYPE), - (COLUMN_LENGTH, COLUMN_LENGTH_DTYPE), - (COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE), - (COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE)] + dtypes_flat = [ + (COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE), + (COLUMN_COVERED, COLUMN_COVERED_DTYPE), + (COLUMN_LENGTH, COLUMN_LENGTH_DTYPE), + (COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE), + (COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE), + ] + + GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA = _GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA() diff --git a/micov/_convert.py b/micov/_convert.py index de2b0d7..5446af6 100644 --- a/micov/_convert.py +++ b/micov/_convert.py @@ -29,14 +29,14 @@ def cigar_to_lens(cigar): """ align, offset = 0, 0 - n = '' # current step size + n = "" # current step size for c in cigar: - if c in 'MDIHNPSX=': - if c in 'M=X': + if c in "MDIHNPSX=": + if c in "M=X": align += int(n) - elif c in 'DN': + elif c in "DN": offset += int(n) - n = '' + n = "" else: n += c diff --git a/micov/_cov.py b/micov/_cov.py index 046e4b0..f1dc178 100644 --- a/micov/_cov.py +++ b/micov/_cov.py @@ -2,28 +2,44 @@ import numba import polars as pl -from ._constants import (COLUMN_GENOME_ID, COLUMN_START, 
COLUMN_STOP, - COLUMN_COVERED, BED_COV_SCHEMA, - COLUMN_LENGTH, COLUMN_PERCENT_COVERED) +from ._constants import ( + COLUMN_GENOME_ID, + COLUMN_START, + COLUMN_STOP, + COLUMN_COVERED, + BED_COV_SCHEMA, + COLUMN_LENGTH, + COLUMN_PERCENT_COVERED, +) def coverage_percent(coverages, lengths): - missing = (set(coverages[COLUMN_GENOME_ID]) - - set(lengths[COLUMN_GENOME_ID])) + missing = set(coverages[COLUMN_GENOME_ID]) - set(lengths[COLUMN_GENOME_ID]) if len(missing) > 0: - raise ValueError(f"{len(missing)} genome(s) appear unrepresented in " - f"the length information, examples: " - f"{sorted(missing)[:5]}") + raise ValueError( + f"{len(missing)} genome(s) appear unrepresented in " + f"the length information, examples: " + f"{sorted(missing)[:5]}" + ) - return (coverages - .lazy() - .with_columns((pl.col(COLUMN_STOP) - - pl.col(COLUMN_START)).alias(COLUMN_COVERED)) - .group_by([COLUMN_GENOME_ID, ]) - .agg(pl.col(COLUMN_COVERED).sum()) - .join(lengths.lazy(), on=COLUMN_GENOME_ID) - .with_columns(((pl.col(COLUMN_COVERED) / - pl.col(COLUMN_LENGTH)) * 100).alias(COLUMN_PERCENT_COVERED))) # noqa + return ( + coverages.lazy() + .with_columns( + (pl.col(COLUMN_STOP) - pl.col(COLUMN_START)).alias(COLUMN_COVERED) + ) + .group_by( + [ + COLUMN_GENOME_ID, + ] + ) + .agg(pl.col(COLUMN_COVERED).sum()) + .join(lengths.lazy(), on=COLUMN_GENOME_ID) + .with_columns( + ((pl.col(COLUMN_COVERED) / pl.col(COLUMN_LENGTH)) * 100).alias( + COLUMN_PERCENT_COVERED + ) + ) + ) # noqa @numba.jit(nopython=True) @@ -59,22 +75,31 @@ def _compress(rows): def compress(df): compressed = [] - for (genome, ), grp in df.group_by([COLUMN_GENOME_ID, ]): - rows = (grp - .lazy() - .select([COLUMN_START, COLUMN_STOP]) - .sort(COLUMN_START) - .collect() - .to_numpy(order='c')) + for (genome,), grp in df.group_by( + [ + COLUMN_GENOME_ID, + ] + ): + rows = ( + grp.lazy() + .select([COLUMN_START, COLUMN_STOP]) + .sort(COLUMN_START) + .collect() + .to_numpy(order="c") + ) grp_compressed = _compress(rows) - grp_compressed_df = pl.LazyFrame(grp_compressed, - schema=[BED_COV_SCHEMA.dtypes_flat[1], - BED_COV_SCHEMA.dtypes_flat[2]], - orient='row') - grp_compressed_df = (grp_compressed_df - .with_columns(pl.lit(genome).alias(COLUMN_GENOME_ID)) - .select(BED_COV_SCHEMA.columns)) + grp_compressed_df = pl.LazyFrame( + grp_compressed, + schema=[ + BED_COV_SCHEMA.dtypes_flat[1], + BED_COV_SCHEMA.dtypes_flat[2], + ], + orient="row", + ) + grp_compressed_df = grp_compressed_df.with_columns( + pl.lit(genome).alias(COLUMN_GENOME_ID) + ).select(BED_COV_SCHEMA.columns) compressed.append(grp_compressed_df.collect()) diff --git a/micov/_io.py b/micov/_io.py index 49d3059..bdc2b11 100644 --- a/micov/_io.py +++ b/micov/_io.py @@ -10,10 +10,18 @@ import gzip from ._cov import compress, coverage_percent -from ._constants import (BED_COV_SCHEMA, GENOME_COVERAGE_SCHEMA, - COLUMN_GENOME_ID, COLUMN_LENGTH, COLUMN_TAXONOMY, - SAM_SUBSET_SCHEMA, COLUMN_CIGAR, COLUMN_STOP, - COLUMN_START, COLUMN_SAMPLE_ID) +from ._constants import ( + BED_COV_SCHEMA, + GENOME_COVERAGE_SCHEMA, + COLUMN_GENOME_ID, + COLUMN_LENGTH, + COLUMN_TAXONOMY, + SAM_SUBSET_SCHEMA, + COLUMN_CIGAR, + COLUMN_STOP, + COLUMN_START, + COLUMN_SAMPLE_ID, +) from ._convert import cigar_to_lens @@ -39,10 +47,14 @@ def _parse_bed_cov(data, feature_drop, feature_keep, lazy): else: skip_rows = 0 - frame = pl.read_csv(data.read(), separator='\t', - new_columns=BED_COV_SCHEMA.columns, - schema_overrides=BED_COV_SCHEMA.dtypes_dict, - has_header=False, skip_rows=skip_rows).lazy() + frame = pl.read_csv( + 
data.read(), + separator="\t", + new_columns=BED_COV_SCHEMA.columns, + schema_overrides=BED_COV_SCHEMA.dtypes_dict, + has_header=False, + skip_rows=skip_rows, + ).lazy() if feature_drop is not None: frame = frame.filter(~pl.col(COLUMN_GENOME_ID).is_in(feature_drop)) @@ -58,36 +70,52 @@ def _parse_bed_cov(data, feature_drop, feature_keep, lazy): def parse_qiita_coverages(tgzs, *args, **kwargs): if not isinstance(tgzs, (list, tuple, set, frozenset)): - tgzs = [tgzs, ] + tgzs = [ + tgzs, + ] - compress_size = kwargs.get('compress_size', 50_000_000) + compress_size = kwargs.get("compress_size", 50_000_000) if compress_size is not None: assert isinstance(compress_size, int) and compress_size >= 0 else: compress_size = math.inf - kwargs['compress_size'] = compress_size + kwargs["compress_size"] = compress_size frame = _parse_qiita_coverages(tgzs[0], *args, **kwargs) for tgz in tgzs[1:]: next_frame = _parse_qiita_coverages(tgz, *args, **kwargs) - frame = _single_df(_check_and_compress([frame, next_frame], - compress_size)) + frame = _single_df( + _check_and_compress([frame, next_frame], compress_size) + ) if compress_size == math.inf: return frame else: - return _single_df(_check_and_compress([frame, ], compress_size=0)) - - -def _parse_qiita_coverages(tgz, compress_size=50_000_000, sample_keep=None, - sample_drop=None, feature_keep=None, - feature_drop=None, append_sample_id=False): + return _single_df( + _check_and_compress( + [ + frame, + ], + compress_size=0, + ) + ) + + +def _parse_qiita_coverages( + tgz, + compress_size=50_000_000, + sample_keep=None, + sample_drop=None, + feature_keep=None, + feature_drop=None, + append_sample_id=False, +): # compress_size=None to disable compression fp = tarfile.open(tgz) try: - fp.extractfile('coverage_percentage.txt') + fp.extractfile("coverage_percentage.txt") except KeyError: raise KeyError(f"{tgz} does not look like a Qiita coverage tgz") @@ -99,11 +127,11 @@ def _parse_qiita_coverages(tgz, compress_size=50_000_000, sample_keep=None, coverages = [] for name in fp.getnames(): - if 'coverages/' not in name: + if "coverages/" not in name: continue - _, filename = name.split('/') - sample_id = filename.rsplit('.', 1)[0] + _, filename = name.split("/") + sample_id = filename.rsplit(".", 1)[0] if sample_id in sample_drop: continue @@ -118,7 +146,9 @@ def _parse_qiita_coverages(tgz, compress_size=50_000_000, sample_keep=None, continue if append_sample_id: - frame = frame.with_columns(pl.lit(sample_id).alias(COLUMN_SAMPLE_ID)) + frame = frame.with_columns( + pl.lit(sample_id).alias(COLUMN_SAMPLE_ID) + ) coverages.append(frame.collect()) coverages = _check_and_compress(coverages, compress_size) @@ -144,21 +174,23 @@ def _check_and_compress(coverages, compress_size): rowcount = sum([len(df) for df in coverages]) if rowcount > compress_size: df = compress(_single_df(coverages)) - coverages = [df, ] + coverages = [ + df, + ] return coverages def _test_has_header(line): if isinstance(line, bytes): - line = line.decode('utf-8') + line = line.decode("utf-8") - genome_id_columns = (COLUMN_GENOME_ID) + genome_id_columns = COLUMN_GENOME_ID - if line.startswith('#'): + if line.startswith("#"): has_header = True - elif line.split('\t')[0] in genome_id_columns: + elif line.split("\t")[0] in genome_id_columns: has_header = True - elif not line.split('\t')[1].strip().isdigit(): + elif not line.split("\t")[1].strip().isdigit(): has_header = True else: has_header = False @@ -168,15 +200,17 @@ def _test_has_header(line): def _test_has_header_taxonomy(line): if 
isinstance(line, bytes): - line = line.decode('utf-8') + line = line.decode("utf-8") - genome_id_columns = (COLUMN_GENOME_ID) - taxonomy_columns = (COLUMN_TAXONOMY) + genome_id_columns = COLUMN_GENOME_ID + taxonomy_columns = COLUMN_TAXONOMY - if line.startswith('#'): + if line.startswith("#"): has_header = True - elif line.split('\t')[0] in genome_id_columns and \ - line.split('\t')[1] in taxonomy_columns: + elif ( + line.split("\t")[0] in genome_id_columns + and line.split("\t")[1] in taxonomy_columns + ): has_header = True else: has_header = False @@ -189,7 +223,7 @@ def parse_genome_lengths(lengths): first_line = fp.readline() has_header = _test_has_header(first_line) - df = pl.read_csv(lengths, separator='\t', has_header=has_header) + df = pl.read_csv(lengths, separator="\t", has_header=has_header) genome_id_col = df.columns[0] length_col = df.columns[1] @@ -203,8 +237,7 @@ def parse_genome_lengths(lengths): if df[length_col].min() <= 0: raise ValueError(f"Lengths of zero or less cannot be used") - rename = {genome_id_col: COLUMN_GENOME_ID, - length_col: COLUMN_LENGTH} + rename = {genome_id_col: COLUMN_GENOME_ID, length_col: COLUMN_LENGTH} return df[[genome_id_col, length_col]].rename(rename) @@ -213,7 +246,7 @@ def parse_taxonomy(taxonomy): first_line = fp.readline() has_header = _test_has_header_taxonomy(first_line) - df = pl.read_csv(taxonomy, separator='\t', has_header=has_header) + df = pl.read_csv(taxonomy, separator="\t", has_header=has_header) genome_id_col = df.columns[0] taxonomy_col = df.columns[1] @@ -221,38 +254,47 @@ def parse_taxonomy(taxonomy): if len(genome_ids) != len(set(genome_ids)): raise ValueError(f"'{genome_id_col}' is not unique") - rename = {genome_id_col: COLUMN_GENOME_ID, - taxonomy_col: COLUMN_TAXONOMY} + rename = {genome_id_col: COLUMN_GENOME_ID, taxonomy_col: COLUMN_TAXONOMY} return df[[genome_id_col, taxonomy_col]].rename(rename) def set_taxonomy_as_id(coverages, taxonomy): - missing = (set(coverages[COLUMN_GENOME_ID]) - - set(taxonomy[COLUMN_GENOME_ID])) + missing = set(coverages[COLUMN_GENOME_ID]) - set( + taxonomy[COLUMN_GENOME_ID] + ) if len(missing) > 0: - raise ValueError(f"{len(missing)} genome(s) appear unrepresented in " - f"the taxonomy information, examples: " - f"{sorted(missing)[:5]}") + raise ValueError( + f"{len(missing)} genome(s) appear unrepresented in " + f"the taxonomy information, examples: " + f"{sorted(missing)[:5]}" + ) - return (coverages - .join(taxonomy, on=COLUMN_GENOME_ID, how='inner') - .select(COLUMN_TAXONOMY, - pl.exclude(COLUMN_TAXONOMY))) + return coverages.join(taxonomy, on=COLUMN_GENOME_ID, how="inner").select( + COLUMN_TAXONOMY, pl.exclude(COLUMN_TAXONOMY) + ) # TODO: this is not the greatest method name def parse_sam_to_df(sam): - df = pl.read_csv(sam, separator='\t', has_header=False, - columns=SAM_SUBSET_SCHEMA.column_indices, - comment_prefix='@', - new_columns=SAM_SUBSET_SCHEMA.columns).lazy() - - return (df - .with_columns(stop=pl.col(COLUMN_CIGAR).map_elements(cigar_to_lens, - return_dtype=int)) # noqa - .with_columns(stop=pl.col(COLUMN_STOP) + pl.col(COLUMN_START)) - .collect()) + df = pl.read_csv( + sam, + separator="\t", + has_header=False, + columns=SAM_SUBSET_SCHEMA.column_indices, + comment_prefix="@", + new_columns=SAM_SUBSET_SCHEMA.columns, + ).lazy() + + return ( + df.with_columns( + stop=pl.col(COLUMN_CIGAR).map_elements( + cigar_to_lens, return_dtype=int + ) + ) # noqa + .with_columns(stop=pl.col(COLUMN_STOP) + pl.col(COLUMN_START)) + .collect() + ) def _add_file(tf, name, data): @@ -267,22 +309,22 
@@ def write_qiita_cov(name, paths, lengths): coverages = [] for p in paths: - with open(p, 'rb') as fp: + with open(p, "rb") as fp: data = fp.read() if len(data) == 0: continue base = os.path.basename(p) - if base.endswith('.cov.gz'): + if base.endswith(".cov.gz"): data = gzip.decompress(data) - name = base.rsplit('.', 2)[0] + '.cov' - elif base.endswith('.cov'): + name = base.rsplit(".", 2)[0] + ".cov" + elif base.endswith(".cov"): name = base else: - name = base + '.cov' + name = base + ".cov" - name = f'coverages/{name}' + name = f"coverages/{name}" _add_file(tf, name, data) current_coverage = _parse_bed_cov(io.BytesIO(data), None, None, False) @@ -291,16 +333,16 @@ def write_qiita_cov(name, paths, lengths): coverage = _single_df(_check_and_compress(coverages, compress_size=0)) - covdataname = 'artifact.cov' + covdataname = "artifact.cov" covdata = io.BytesIO() - coverage.write_csv(covdata, separator='\t', include_header=True) + coverage.write_csv(covdata, separator="\t", include_header=True) covdata.seek(0) _add_file(tf, covdataname, covdata.read()) genome_coverage = coverage_percent(coverage, lengths).collect() - pername = 'coverage_percentage.txt' + pername = "coverage_percentage.txt" perdata = io.BytesIO() - genome_coverage.write_csv(perdata, separator='\t', include_header=True) + genome_coverage.write_csv(perdata, separator="\t", include_header=True) perdata.seek(0) _add_file(tf, pername, perdata.read()) @@ -308,35 +350,35 @@ def write_qiita_cov(name, paths, lengths): def parse_sample_metadata(path): - df = pl.read_csv(path, separator='\t', infer_schema_length=0) + df = pl.read_csv(path, separator="\t", infer_schema_length=0) return df.rename({df.columns[0]: COLUMN_SAMPLE_ID}) @contextmanager def _reader(sam): """Indirection to support reading from stdin or a file.""" - if sam == '-' or sam is None: + if sam == "-" or sam is None: data = sys.stdin.buffer yield data elif isinstance(sam, io.BytesIO): yield sam else: - if sam.endswith('.sam.xz'): - with lzma.open(sam, mode='rb') as fp: + if sam.endswith(".sam.xz"): + with lzma.open(sam, mode="rb") as fp: yield fp - elif sam.endswith('.sam.gz'): - with gzip.open(sam, mode='rb') as fp: + elif sam.endswith(".sam.gz"): + with gzip.open(sam, mode="rb") as fp: yield fp else: - with open(sam, 'rb') as fp: + with open(sam, "rb") as fp: yield fp def _buf_to_bytes(buf): if isinstance(buf[0], str): - return io.StringIO(''.join(buf)) + return io.StringIO("".join(buf)) else: - return io.BytesIO(b''.join(buf)) + return io.BytesIO(b"".join(buf)) def _subset_sam_to_bed(df): @@ -358,9 +400,9 @@ def compress_from_stream(sam, bufsize=100_000_000, disable_compression=False): line = buf[0] if isinstance(line, str): - delim = '\t' + delim = "\t" elif isinstance(line, bytes): - delim = b'\t' + delim = b"\t" else: raise ValueError(f"Unexpected buffer type: {type(line)}") @@ -378,11 +420,16 @@ def compress_from_stream(sam, bufsize=100_000_000, disable_compression=False): def parse_coverage(data, features_to_keep): - cov_df = pl.read_csv(data.read(), separator='\t', - new_columns=GENOME_COVERAGE_SCHEMA.columns, - schema_overrides=GENOME_COVERAGE_SCHEMA.dtypes_dict).lazy() + cov_df = pl.read_csv( + data.read(), + separator="\t", + new_columns=GENOME_COVERAGE_SCHEMA.columns, + schema_overrides=GENOME_COVERAGE_SCHEMA.dtypes_dict, + ).lazy() if features_to_keep is not None: - cov_df = cov_df.filter(pl.col(COLUMN_GENOME_ID).is_in(features_to_keep)) + cov_df = cov_df.filter( + pl.col(COLUMN_GENOME_ID).is_in(features_to_keep) + ) return cov_df diff --git 
a/micov/_per_sample.py b/micov/_per_sample.py index 14ab29d..436f52a 100644 --- a/micov/_per_sample.py +++ b/micov/_per_sample.py @@ -4,15 +4,22 @@ import polars as pl -def per_sample_coverage(qiita_coverages, current_samples, features_to_keep, - features_to_ignore, lengths): +def per_sample_coverage( + qiita_coverages, + current_samples, + features_to_keep, + features_to_ignore, + lengths, +): try: - coverage = parse_qiita_coverages(qiita_coverages, - sample_keep=current_samples, - feature_keep=features_to_keep, - feature_drop=features_to_ignore, - compress_size=None, - append_sample_id=True) + coverage = parse_qiita_coverages( + qiita_coverages, + sample_keep=current_samples, + feature_keep=features_to_keep, + feature_drop=features_to_ignore, + compress_size=None, + append_sample_id=True, + ) except ValueError: # we expect this to only occur when requested samples or features # are not present @@ -23,7 +30,11 @@ def per_sample_coverage(qiita_coverages, current_samples, features_to_keep, def compress_per_sample(coverage, lengths): sample_contig_coverage = [] - for (sample, ), sample_grp in coverage.group_by([COLUMN_SAMPLE_ID, ]): + for (sample,), sample_grp in coverage.group_by( + [ + COLUMN_SAMPLE_ID, + ] + ): compressed = compress(sample_grp) cov_per = coverage_percent(compressed, lengths) cov_per = cov_per.with_columns(pl.lit(sample).alias(COLUMN_SAMPLE_ID)) diff --git a/micov/_plot.py b/micov/_plot.py index ddbc92c..9450cfd 100644 --- a/micov/_plot.py +++ b/micov/_plot.py @@ -5,39 +5,75 @@ import polars as pl import scipy.stats as ss from ._cov import coverage_percent, compress -from ._constants import (COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, - COLUMN_PERCENT_COVERED, COLUMN_LENGTH, - BED_COV_SCHEMA, COLUMN_START, COLUMN_STOP) +from ._constants import ( + COLUMN_SAMPLE_ID, + COLUMN_GENOME_ID, + COLUMN_PERCENT_COVERED, + COLUMN_LENGTH, + BED_COV_SCHEMA, + COLUMN_START, + COLUMN_STOP, +) def ordered_coverage(coverage, grp, target): - return (coverage.lazy() + return ( + coverage.lazy() .join(grp.lazy(), on=COLUMN_SAMPLE_ID) .filter(pl.col(COLUMN_GENOME_ID) == target) .sort(COLUMN_PERCENT_COVERED) .with_row_index() - .with_columns(x=pl.col('index') / pl.len())).collect() + .with_columns(x=pl.col("index") / pl.len()) + ).collect() def slice_positions(positions, id_): - return (positions - .lazy() - .filter(pl.col(COLUMN_SAMPLE_ID) == id_) - .select(pl.col(COLUMN_GENOME_ID), pl.col(COLUMN_START), - pl.col(COLUMN_STOP))) - - -def per_sample_plots(all_coverage, all_covered_positions, metadata, - sample_metadata_column, output): + return ( + positions.lazy() + .filter(pl.col(COLUMN_SAMPLE_ID) == id_) + .select( + pl.col(COLUMN_GENOME_ID), pl.col(COLUMN_START), pl.col(COLUMN_STOP) + ) + ) + + +def per_sample_plots( + all_coverage, + all_covered_positions, + metadata, + sample_metadata_column, + output, +): for genome in all_coverage[COLUMN_GENOME_ID].unique(): - non_cumulative(metadata, all_coverage, genome, sample_metadata_column, - output) - cumulative(metadata, all_coverage, all_covered_positions, genome, - sample_metadata_column, output) - position_plot(metadata, all_coverage, all_covered_positions, genome, - sample_metadata_column, output, scale=None) - position_plot(metadata, all_coverage, all_covered_positions, genome, - sample_metadata_column, output, scale=10000) + non_cumulative( + metadata, all_coverage, genome, sample_metadata_column, output + ) + cumulative( + metadata, + all_coverage, + all_covered_positions, + genome, + sample_metadata_column, + output, + ) + position_plot( + 
metadata, + all_coverage, + all_covered_positions, + genome, + sample_metadata_column, + output, + scale=None, + ) + position_plot( + metadata, + all_coverage, + all_covered_positions, + genome, + sample_metadata_column, + output, + scale=10000, + ) def cumulative(metadata, coverage, positions, target, variable, output): @@ -61,7 +97,12 @@ def cumulative(metadata, coverage, positions, target, variable, output): length = lengths[COLUMN_LENGTH].item(0) - for (name, ), grp in metadata.group_by([variable, ], maintain_order=True): + for (name,), grp in metadata.group_by( + [ + variable, + ], + maintain_order=True, + ): current = pl.DataFrame([], schema=BED_COV_SCHEMA.dtypes_flat) grp_coverage = ordered_coverage(coverage, grp, target) @@ -71,7 +112,7 @@ def cumulative(metadata, coverage, positions, target, variable, output): labels.append(name) cur_y = [] - cur_x = grp_coverage['x'] + cur_x = grp_coverage["x"] for id_ in grp_coverage[COLUMN_SAMPLE_ID]: next_ = slice_positions(target_positions, id_).collect() current = compress(pl.concat([current, next_])) @@ -83,23 +124,22 @@ def cumulative(metadata, coverage, positions, target, variable, output): if len(covs) > 1: k, p = ss.kruskal(*covs) - stat = f'\nKruskal: stat={k:.2f}; p={p:.2e}' + stat = f"\nKruskal: stat={k:.2f}; p={p:.2e}" else: - stat = '' + stat = "" ax = plt.gca() - ax.set_ylabel('Percent genome covered', fontsize=20) - ax.set_xlabel('Proportion of samples', fontsize=20) - ax.tick_params(axis='both', which='major', labelsize=16) - ax.tick_params(axis='both', which='minor', labelsize=16) + ax.set_ylabel("Percent genome covered", fontsize=20) + ax.set_xlabel("Proportion of samples", fontsize=20) + ax.tick_params(axis="both", which="major", labelsize=16) + ax.tick_params(axis="both", which="minor", labelsize=16) ax.set_xlim(0, 1) ax.set_ylim(0, 100) - ax.set_title((f'Cumulative: {target} ' - f'({length}bp){stat}'), fontsize=20) + ax.set_title((f"Cumulative: {target} " f"({length}bp){stat}"), fontsize=20) plt.legend(labels, fontsize=20) plt.tight_layout() - plt.savefig(f'{output}.{target}.{variable}.cumulative.png') + plt.savefig(f"{output}.{target}.{variable}.cumulative.png") plt.close() @@ -108,7 +148,12 @@ def non_cumulative(metadata, coverage, target, variable, output): labels = [] covs = [] - for (name, ), grp in metadata.group_by([variable, ], maintain_order=True): + for (name,), grp in metadata.group_by( + [ + variable, + ], + maintain_order=True, + ): grp_coverage = ordered_coverage(coverage, grp, target) if len(grp_coverage) == 0: @@ -116,7 +161,7 @@ def non_cumulative(metadata, coverage, target, variable, output): labels.append(name) covs.append(grp_coverage[COLUMN_PERCENT_COVERED]) - plt.plot(grp_coverage['x'], grp_coverage[COLUMN_PERCENT_COVERED]) + plt.plot(grp_coverage["x"], grp_coverage[COLUMN_PERCENT_COVERED]) # assumes all the same contig, so if `ordered_coverage` changes than # this assumption would not be valid @@ -125,23 +170,24 @@ def non_cumulative(metadata, coverage, target, variable, output): if len(covs) > 1: k, p = ss.kruskal(*covs) - stat = f'\nKruskal: stat={k:.2f}; p={p:.2e}' + stat = f"\nKruskal: stat={k:.2f}; p={p:.2e}" else: - stat = '' + stat = "" ax = plt.gca() - ax.set_ylabel('Percent genome covered', fontsize=20) - ax.set_xlabel('Proportion of samples', fontsize=20) - ax.tick_params(axis='both', which='major', labelsize=16) - ax.tick_params(axis='both', which='minor', labelsize=16) + ax.set_ylabel("Percent genome covered", fontsize=20) + ax.set_xlabel("Proportion of samples", fontsize=20) + 
ax.tick_params(axis="both", which="major", labelsize=16) + ax.tick_params(axis="both", which="minor", labelsize=16) ax.set_xlim(0, 1) ax.set_ylim(0, 100) - ax.set_title((f'Non-cumulative: {target} ' - f'({length}bp){stat}'), fontsize=20) + ax.set_title( + (f"Non-cumulative: {target} " f"({length}bp){stat}"), fontsize=20 + ) plt.legend(labels, fontsize=20) plt.tight_layout() - plt.savefig(f'{output}.{target}.{variable}.non-cumulative.png') + plt.savefig(f"{output}.{target}.{variable}.non-cumulative.png") plt.close() @@ -151,49 +197,63 @@ def _get_covered(x_start_stop): def single_sample_position_plot(positions, lengths, output, scale=None): - positions = (positions - .lazy() - .join(lengths.lazy(), on=COLUMN_GENOME_ID) - .with_columns(x=0.5)).collect() - for (name, grp) in positions.group_by(COLUMN_GENOME_ID): + positions = ( + positions.lazy() + .join(lengths.lazy(), on=COLUMN_GENOME_ID) + .with_columns(x=0.5) + ).collect() + for name, grp in positions.group_by(COLUMN_GENOME_ID): plt.figure(figsize=(12, 8)) ax = plt.gca() - grp = (grp.lazy() - .sort(by=COLUMN_START) - .select(pl.col('x'), - pl.col(COLUMN_START) / pl.col(COLUMN_LENGTH), - pl.col(COLUMN_STOP) / pl.col(COLUMN_LENGTH))) + grp = ( + grp.lazy() + .sort(by=COLUMN_START) + .select( + pl.col("x"), + pl.col(COLUMN_START) / pl.col(COLUMN_LENGTH), + pl.col(COLUMN_STOP) / pl.col(COLUMN_LENGTH), + ) + ) covered_positions = _get_covered(grp.collect().to_numpy()) - lc = mc.LineCollection(covered_positions, - linewidths=2, alpha=0.7) + lc = mc.LineCollection(covered_positions, linewidths=2, alpha=0.7) ax.add_collection(lc) ax.set_xlim(-0.01, 1.0) ax.set_ylim(0, 1.0) - ax.set_title(f'Position plot: {name}', fontsize=20) - ax.set_ylabel('Unit normalized position', fontsize=20) + ax.set_title(f"Position plot: {name}", fontsize=20) + ax.set_ylabel("Unit normalized position", fontsize=20) scaletag = "" - ax.tick_params(axis='both', which='major', labelsize=16) - ax.tick_params(axis='both', which='minor', labelsize=16) + ax.tick_params(axis="both", which="major", labelsize=16) + ax.tick_params(axis="both", which="minor", labelsize=16) plt.tight_layout() - plt.savefig(f'{output}.{name}.position-plot.png') + plt.savefig(f"{output}.{name}.position-plot.png") plt.close() -def position_plot(metadata, coverage, positions, target, variable, output, scale=None): +def position_plot( + metadata, coverage, positions, target, variable, output, scale=None +): plt.figure(figsize=(12, 8)) ax = plt.gca() labels = [] colors = [] - target_positions = positions.filter(pl.col(COLUMN_GENOME_ID) == target).lazy() - - for ((name, ), grp), color in zip(metadata.group_by([variable, ], - maintain_order=True), - range(0, 10)): - color = f'C{color}' + target_positions = positions.filter( + pl.col(COLUMN_GENOME_ID) == target + ).lazy() + + for ((name,), grp), color in zip( + metadata.group_by( + [ + variable, + ], + maintain_order=True, + ), + range(0, 10), + ): + color = f"C{color}" grp_coverage = ordered_coverage(coverage, grp, target) if len(grp_coverage) == 0: @@ -206,57 +266,67 @@ def position_plot(metadata, coverage, positions, target, variable, output, scale hist_x = [] hist_y = [] - col_selection = [COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, 'x'] + col_selection = [COLUMN_SAMPLE_ID, COLUMN_GENOME_ID, "x"] for sid, gid, x in grp_coverage[col_selection].rows(): - cur_positions = (target_positions - .filter(pl.col(COLUMN_SAMPLE_ID) == sid) - .join(grp_coverage.lazy(), on=COLUMN_SAMPLE_ID) - .select(pl.col('x'), - pl.col(COLUMN_START) / length, - pl.col(COLUMN_STOP) / length)) 
+ cur_positions = ( + target_positions.filter(pl.col(COLUMN_SAMPLE_ID) == sid) + .join(grp_coverage.lazy(), on=COLUMN_SAMPLE_ID) + .select( + pl.col("x"), + pl.col(COLUMN_START) / length, + pl.col(COLUMN_STOP) / length, + ) + ) if scale is None: - covered_positions = _get_covered(cur_positions.collect().to_numpy()) - lc = mc.LineCollection(covered_positions, color=color, - linewidths=0.5, alpha=0.7) + covered_positions = _get_covered( + cur_positions.collect().to_numpy() + ) + lc = mc.LineCollection( + covered_positions, color=color, linewidths=0.5, alpha=0.7 + ) ax.add_collection(lc) else: - covered_positions = pl.concat([ - cur_positions.select(pl.col('start').alias('common')), - cur_positions.select(pl.col('stop').alias('common')) - ]).collect() - - obs_count, obs_bins = np.histogram(covered_positions, - bins=scale, - range=(0, 1)) + covered_positions = pl.concat( + [ + cur_positions.select(pl.col("start").alias("common")), + cur_positions.select(pl.col("stop").alias("common")), + ] + ).collect() + + obs_count, obs_bins = np.histogram( + covered_positions, bins=scale, range=(0, 1) + ) obs_bins = obs_bins[:-1][obs_count > 0] hist_x.extend([x for _ in obs_bins]) hist_y.extend([b for b in obs_bins]) if scale is not None: - ax.scatter(hist_x, hist_y, s=.2, color=color, alpha=0.7) + ax.scatter(hist_x, hist_y, s=0.2, color=color, alpha=0.7) ax.set_xlim(-0.01, 1.0) ax.set_ylim(0, 1.0) if scale is None: - ax.set_title(f'Position plot: {target} ({length}bp)', fontsize=20) - ax.set_ylabel('Unit normalized genome position', fontsize=20) + ax.set_title(f"Position plot: {target} ({length}bp)", fontsize=20) + ax.set_ylabel("Unit normalized genome position", fontsize=20) scaletag = "" else: - ax.set_title(f'Scaled position plot: {target} ({length}bp)', fontsize=20) - ax.set_ylabel(f'Coverage (1/{scale})th scale', fontsize=20) + ax.set_title( + f"Scaled position plot: {target} ({length}bp)", fontsize=20 + ) + ax.set_ylabel(f"Coverage (1/{scale})th scale", fontsize=20) scaletag = f"-1_{scale}th-scale" - ax.set_xlabel('Proportion of samples', fontsize=20) + ax.set_xlabel("Proportion of samples", fontsize=20) plt.legend(labels, fontsize=20) leg = ax.get_legend() for i, lh in enumerate(leg.legendHandles): lh.set_color(colors[i]) - ax.tick_params(axis='both', which='major', labelsize=16) - ax.tick_params(axis='both', which='minor', labelsize=16) + ax.tick_params(axis="both", which="major", labelsize=16) + ax.tick_params(axis="both", which="minor", labelsize=16) plt.tight_layout() - plt.savefig(f'{output}.{target}.{variable}.position-plot{scaletag}.png') + plt.savefig(f"{output}.{target}.{variable}.position-plot{scaletag}.png") plt.close() diff --git a/micov/_version.py b/micov/_version.py index 3ff1e13..f437c8c 100644 --- a/micov/_version.py +++ b/micov/_version.py @@ -1,4 +1,3 @@ - # This file helps to compute a version number in source trees obtained from # git-archive tarball (such as those provided by githubs download-from-tag # feature). 
Distribution tarballs (built by setup.py sdist) and build @@ -68,12 +67,14 @@ class NotThisMethod(Exception): def register_vcs_handler(vcs: str, method: str) -> Callable: # decorator """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f: Callable) -> Callable: """Store f in HANDLERS[vcs][method].""" if vcs not in HANDLERS: HANDLERS[vcs] = {} HANDLERS[vcs][method] = f return f + return decorate @@ -100,10 +101,14 @@ def run_command( try: dispcmd = str([command] + args) # remember shell=False, so use git.cmd on windows, not just git - process = subprocess.Popen([command] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None), **popen_kwargs) + process = subprocess.Popen( + [command] + args, + cwd=cwd, + env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr else None), + **popen_kwargs, + ) break except OSError as e: if e.errno == errno.ENOENT: @@ -141,15 +146,21 @@ def versions_from_parentdir( for _ in range(3): dirname = os.path.basename(root) if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} + return { + "version": dirname[len(parentdir_prefix) :], + "full-revisionid": None, + "dirty": False, + "error": None, + "date": None, + } rootdirs.append(root) root = os.path.dirname(root) # up a level if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) + print( + "Tried directories %s but none started with prefix %s" + % (str(rootdirs), parentdir_prefix) + ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") @@ -212,7 +223,7 @@ def git_versions_from_keywords( # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. TAG = "tag: " - tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + tags = {r[len(TAG) :] for r in refs if r.startswith(TAG)} if not tags: # Either we're using git < 1.8.3, or there really are no tags. We use # a heuristic: assume all version tags have a digit. The old git %d @@ -221,7 +232,7 @@ def git_versions_from_keywords( # between branches and tags. By ignoring refnames without digits, we # filter out many common branch names like "release" and # "stabilization", as well as "HEAD" and "master". - tags = {r for r in refs if re.search(r'\d', r)} + tags = {r for r in refs if re.search(r"\d", r)} if verbose: print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: @@ -229,32 +240,36 @@ def git_versions_from_keywords( for ref in sorted(tags): # sorting will prefer e.g. 
"2.0" over "2.0rc1" if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] + r = ref[len(tag_prefix) :] # Filter out refs that exactly match prefix or that don't start # with a number once the prefix is stripped (mostly a concern # when prefix is '') - if not re.match(r'\d', r): + if not re.match(r"\d", r): continue if verbose: print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } # no suitable tags, so version is "0+unknown", but full hex is still there if verbose: print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } @register_vcs_handler("git", "pieces_from_vcs") def git_pieces_from_vcs( - tag_prefix: str, - root: str, - verbose: bool, - runner: Callable = run_command + tag_prefix: str, root: str, verbose: bool, runner: Callable = run_command ) -> Dict[str, Any]: """Get version from 'git describe' in the root of the source tree. @@ -273,8 +288,9 @@ def git_pieces_from_vcs( env.pop("GIT_DIR", None) runner = functools.partial(runner, env=env) - _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=not verbose) + _, rc = runner( + GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=not verbose + ) if rc != 0: if verbose: print("Directory %s not under git control" % root) @@ -282,10 +298,19 @@ def git_pieces_from_vcs( # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = runner(GITS, [ - "describe", "--tags", "--dirty", "--always", "--long", - "--match", f"{tag_prefix}[[:digit:]]*" - ], cwd=root) + describe_out, rc = runner( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + f"{tag_prefix}[[:digit:]]*", + ], + cwd=root, + ) # --long was added in git-1.5.5 if describe_out is None: raise NotThisMethod("'git describe' failed") @@ -300,8 +325,9 @@ def git_pieces_from_vcs( pieces["short"] = full_out[:7] # maybe improved later pieces["error"] = None - branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], - cwd=root) + branch_name, rc = runner( + GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root + ) # --abbrev-ref was added in git-1.6.3 if rc != 0 or branch_name is None: raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") @@ -341,17 +367,18 @@ def git_pieces_from_vcs( dirty = git_describe.endswith("-dirty") pieces["dirty"] = dirty if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] + git_describe = git_describe[: git_describe.rindex("-dirty")] # now we have TAG-NUM-gHEX or HEX if "-" in git_describe: # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparsable. Maybe git-describe is misbehaving? 
- pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) + pieces["error"] = ( + "unable to parse git-describe output: '%s'" % describe_out + ) return pieces # tag @@ -360,10 +387,12 @@ def git_pieces_from_vcs( if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] + pieces["closest-tag"] = full_tag[len(tag_prefix) :] # distance: number of commits since tag pieces["distance"] = int(mo.group(2)) @@ -378,7 +407,9 @@ def git_pieces_from_vcs( pieces["distance"] = len(out.split()) # total number of commits # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[ + 0 + ].strip() # Use only the last line. Previous lines may contain GPG signature # information. date = date.splitlines()[-1] @@ -412,8 +443,7 @@ def render_pep440(pieces: Dict[str, Any]) -> str: rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -442,8 +472,7 @@ def render_pep440_branch(pieces: Dict[str, Any]) -> str: rendered = "0" if pieces["branch"] != "master": rendered += ".dev0" - rendered += "+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) + rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"]) if pieces["dirty"]: rendered += ".dirty" return rendered @@ -468,10 +497,15 @@ def render_pep440_pre(pieces: Dict[str, Any]) -> str: if pieces["closest-tag"]: if pieces["distance"]: # update the post release segment - tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + tag_version, post_version = pep440_split_post( + pieces["closest-tag"] + ) rendered = tag_version if post_version is not None: - rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"]) + rendered += ".post%d.dev%d" % ( + post_version + 1, + pieces["distance"], + ) else: rendered += ".post0.dev%d" % (pieces["distance"]) else: @@ -604,11 +638,13 @@ def render_git_describe_long(pieces: Dict[str, Any]) -> str: def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: """Render the given version pieces into the requested style.""" if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} + return { + "version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None, + } if not style or style == "default": style = "pep440" # the default @@ -632,9 +668,13 @@ def render(pieces: Dict[str, Any], style: str) -> Dict[str, Any]: else: raise ValueError("unknown style '%s'" % style) - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} + return { + "version": rendered, + "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], + "error": None, + "date": pieces.get("date"), + } def get_versions() -> Dict[str, Any]: @@ -648,8 +688,9 @@ def get_versions() -> Dict[str, Any]: verbose = cfg.verbose try: - return 
git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) + return git_versions_from_keywords( + get_keywords(), cfg.tag_prefix, verbose + ) except NotThisMethod: pass @@ -658,13 +699,16 @@ def get_versions() -> Dict[str, Any]: # versionfile_source is the relative path from the top of the source # tree (where the .git directory might live) to this file. Invert # this to find the root from __file__. - for _ in cfg.versionfile_source.split('/'): + for _ in cfg.versionfile_source.split("/"): root = os.path.dirname(root) except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } try: pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) @@ -678,6 +722,10 @@ def get_versions() -> Dict[str, Any]: except NotThisMethod: pass - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/micov/cli.py b/micov/cli.py index 542c8e9..d778c09 100644 --- a/micov/cli.py +++ b/micov/cli.py @@ -8,10 +8,19 @@ import sys import tqdm from glob import glob -from ._io import (parse_genome_lengths, parse_taxonomy, set_taxonomy_as_id, - parse_qiita_coverages, parse_sam_to_df, write_qiita_cov, - parse_sample_metadata, compress_from_stream, - parse_bed_cov_to_df, _single_df, _check_and_compress) +from ._io import ( + parse_genome_lengths, + parse_taxonomy, + set_taxonomy_as_id, + parse_qiita_coverages, + parse_sam_to_df, + write_qiita_cov, + parse_sample_metadata, + compress_from_stream, + parse_bed_cov_to_df, + _single_df, + _check_and_compress, +) from ._cov import coverage_percent from ._convert import cigar_to_lens from ._per_sample import per_sample_coverage @@ -21,7 +30,7 @@ def _first_col_as_set(fp): - df = pl.read_csv(fp, separator='\t', infer_schema_length=0) + df = pl.read_csv(fp, separator="\t", infer_schema_length=0) return set(df[df.columns[0]]) @@ -32,25 +41,53 @@ def cli(): @cli.command() -@click.option('--qiita-coverages', type=click.Path(exists=True), multiple=True, - required=True, help='Pre-computed Qiita coverage data') -@click.option('--samples-to-keep', type=click.Path(exists=True), - required=False, - help='A metadata file with the samples to keep') -@click.option('--samples-to-ignore', type=click.Path(exists=True), - required=False, - help='A metadata file with the samples to ignore') -@click.option('--features-to-keep', type=click.Path(exists=True), - required=False, - help='A metadata file with the features to keep') -@click.option('--features-to-ignore', type=click.Path(exists=True), - required=False, - help='A metadata file with the features to ignore') -@click.option('--output', type=click.Path(exists=False), required=True) -@click.option('--lengths', type=click.Path(exists=True), required=True, - help="Genome lengths") -def qiita_coverage(qiita_coverages, samples_to_keep, samples_to_ignore, - features_to_keep, features_to_ignore, output, lengths): +@click.option( + "--qiita-coverages", + type=click.Path(exists=True), + multiple=True, + required=True, + help="Pre-computed Qiita coverage data", +) +@click.option( + "--samples-to-keep", + type=click.Path(exists=True), + required=False, + help="A metadata file 
with the samples to keep", +) +@click.option( + "--samples-to-ignore", + type=click.Path(exists=True), + required=False, + help="A metadata file with the samples to ignore", +) +@click.option( + "--features-to-keep", + type=click.Path(exists=True), + required=False, + help="A metadata file with the features to keep", +) +@click.option( + "--features-to-ignore", + type=click.Path(exists=True), + required=False, + help="A metadata file with the features to ignore", +) +@click.option("--output", type=click.Path(exists=False), required=True) +@click.option( + "--lengths", + type=click.Path(exists=True), + required=True, + help="Genome lengths", +) +def qiita_coverage( + qiita_coverages, + samples_to_keep, + samples_to_ignore, + features_to_keep, + features_to_ignore, + output, + lengths, +): """Compute aggregated coverage from one or more Qiita coverage files.""" if samples_to_keep: samples_to_keep = _first_col_as_set(samples_to_keep) @@ -66,29 +103,47 @@ def qiita_coverage(qiita_coverages, samples_to_keep, samples_to_ignore, lengths = parse_genome_lengths(lengths) - coverage = parse_qiita_coverages(qiita_coverages, - sample_keep=samples_to_keep, - sample_drop=samples_to_ignore, - feature_keep=features_to_keep, - feature_drop=features_to_ignore) - coverage.write_csv(output + '.covered-positions.tsv', separator='\t', - include_header=True) + coverage = parse_qiita_coverages( + qiita_coverages, + sample_keep=samples_to_keep, + sample_drop=samples_to_ignore, + feature_keep=features_to_keep, + feature_drop=features_to_ignore, + ) + coverage.write_csv( + output + ".covered-positions.tsv", separator="\t", include_header=True + ) genome_coverage = coverage_percent(coverage, lengths).collect() - genome_coverage.write_csv(output + '.coverage.tsv', separator='\t', - include_header=True) + genome_coverage.write_csv( + output + ".coverage.tsv", separator="\t", include_header=True + ) @cli.command() -@click.option('--data', type=click.Path(exists=True), required=False) -@click.option('--output', type=click.Path(exists=False)) -@click.option('--disable-compression', is_flag=True, default=False, - help='Do not compress the regions') -@click.option('--lengths', type=click.Path(exists=True), required=False, - help='Genome lengths, if provided compute coverage') -@click.option('--taxonomy', type=click.Path(exists=True), required=False, - help=('Genome taxonomy, if provided show species in coverage ' - 'percentage. Only works when --length is provided')) +@click.option("--data", type=click.Path(exists=True), required=False) +@click.option("--output", type=click.Path(exists=False)) +@click.option( + "--disable-compression", + is_flag=True, + default=False, + help="Do not compress the regions", +) +@click.option( + "--lengths", + type=click.Path(exists=True), + required=False, + help="Genome lengths, if provided compute coverage", +) +@click.option( + "--taxonomy", + type=click.Path(exists=True), + required=False, + help=( + "Genome taxonomy, if provided show species in coverage " + "percentage. Only works when --length is provided" + ), +) def compress(data, output, disable_compression, lengths, taxonomy): """Compress BAM/SAM/BED mapping data. 
@@ -96,7 +151,7 @@ def compress(data, output, disable_compression, lengths, taxonomy): samtools view foo.bam | micov coverage | gzip > foo.cov.gz """ - if output == '-' or output is None: + if output == "-" or output is None: output = sys.stdout if lengths is not None: @@ -106,16 +161,19 @@ def compress(data, output, disable_compression, lengths, taxonomy): taxonomy = parse_taxonomy(taxonomy) if os.path.isdir(data): - file_list = (glob(data + "/*.sam") - + glob(data + '/*.sam.xz') - + glob(data + '/*.sam.gz')) + file_list = ( + glob(data + "/*.sam") + + glob(data + "/*.sam.xz") + + glob(data + "/*.sam.gz") + ) else: file_list = [data] dfs = [] for samfile in file_list: - df = compress_from_stream(samfile, - disable_compression=disable_compression) + df = compress_from_stream( + samfile, disable_compression=disable_compression + ) if df is None or len(df) == 0: logger.warning("File appears empty...") else: @@ -123,32 +181,40 @@ def compress(data, output, disable_compression, lengths, taxonomy): coverage = _single_df(_check_and_compress(dfs, compress_size=0)) if lengths is None: - coverage.write_csv(output, separator='\t', include_header=True) + coverage.write_csv(output, separator="\t", include_header=True) else: genome_coverage = coverage_percent(coverage, lengths).collect() if taxonomy is None: - genome_coverage.write_csv(output, separator='\t', - include_header=True) + genome_coverage.write_csv( + output, separator="\t", include_header=True + ) else: - genome_coverage_with_taxonomy = set_taxonomy_as_id(genome_coverage, - taxonomy) - genome_coverage_with_taxonomy.write_csv(output, separator='\t', - include_header=True) + genome_coverage_with_taxonomy = set_taxonomy_as_id( + genome_coverage, taxonomy + ) + genome_coverage_with_taxonomy.write_csv( + output, separator="\t", include_header=True + ) @cli.command() -@click.option('--positions', type=click.Path(exists=True), required=False, - help='BED3') -@click.option('--output', type=click.Path(exists=False), required=False) -@click.option('--lengths', type=click.Path(exists=True), required=True, - help="Genome lengths") +@click.option( + "--positions", type=click.Path(exists=True), required=False, help="BED3" +) +@click.option("--output", type=click.Path(exists=False), required=False) +@click.option( + "--lengths", + type=click.Path(exists=True), + required=True, + help="Genome lengths", +) def position_plot(positions, output, lengths): """Construct a single sample coverage plot.""" if positions is None: data = sys.stdin else: - data = open(positions, 'rb') + data = open(positions, "rb") lengths = parse_genome_lengths(lengths) df = parse_bed_cov_to_df(data) @@ -156,10 +222,14 @@ def position_plot(positions, output, lengths): @cli.command() -@click.option('--paths', type=click.Path(exists=True), required=True) -@click.option('--output', type=click.Path(exists=False)) -@click.option('--lengths', type=click.Path(exists=True), required=True, - help="Genome lengths") +@click.option("--paths", type=click.Path(exists=True), required=True) +@click.option("--output", type=click.Path(exists=False)) +@click.option( + "--lengths", + type=click.Path(exists=True), + required=True, + help="Genome lengths", +) def consolidate(paths, output, lengths): """Consolidate coverage files into a Qiita-like coverage.tgz.""" paths = [path.strip() for path in open(paths)] @@ -171,22 +241,46 @@ def consolidate(paths, output, lengths): @cli.command() -@click.option('--qiita-coverages', type=click.Path(exists=True), multiple=True, - required=True, help='Pre-computed Qiita 
coverage data') -@click.option('--output', type=click.Path(exists=False)) -@click.option('--lengths', type=click.Path(exists=True), required=True, - help="Genome lengths") -@click.option('--samples-to-keep', type=click.Path(exists=True), - required=False, - help='A metadata file with the sample metadata') -@click.option('--features-to-keep', type=click.Path(exists=True), - required=False, - help='A metadata file with the features to keep') -@click.option('--features-to-ignore', type=click.Path(exists=True), - required=False, - help='A metadata file with the features to ignore') -def qiita_to_parquet(qiita_coverages, lengths, output, samples_to_keep, - features_to_keep, features_to_ignore): +@click.option( + "--qiita-coverages", + type=click.Path(exists=True), + multiple=True, + required=True, + help="Pre-computed Qiita coverage data", +) +@click.option("--output", type=click.Path(exists=False)) +@click.option( + "--lengths", + type=click.Path(exists=True), + required=True, + help="Genome lengths", +) +@click.option( + "--samples-to-keep", + type=click.Path(exists=True), + required=False, + help="A metadata file with the sample metadata", +) +@click.option( + "--features-to-keep", + type=click.Path(exists=True), + required=False, + help="A metadata file with the features to keep", +) +@click.option( + "--features-to-ignore", + type=click.Path(exists=True), + required=False, + help="A metadata file with the features to ignore", +) +def qiita_to_parquet( + qiita_coverages, + lengths, + output, + samples_to_keep, + features_to_keep, + features_to_ignore, +): """Aggregate Qiita coverage to parquet.""" if features_to_keep: features_to_keep = _first_col_as_set(features_to_keep) @@ -198,40 +292,66 @@ def qiita_to_parquet(qiita_coverages, lengths, output, samples_to_keep, samples_to_keep = _first_col_as_set(samples_to_keep) lengths = parse_genome_lengths(lengths) - covered_positions, coverage = per_sample_coverage(qiita_coverages, - samples_to_keep, - features_to_keep, - features_to_ignore, - lengths) - - coverage.collect().write_parquet(output + '.coverage.parquet', - compression='zstd', - compression_level=3) # default afaik - covered_positions.write_parquet(output + '.covered_positions.parquet', - compression='zstd', - compression_level=3) # default afaik + covered_positions, coverage = per_sample_coverage( + qiita_coverages, + samples_to_keep, + features_to_keep, + features_to_ignore, + lengths, + ) + + coverage.collect().write_parquet( + output + ".coverage.parquet", compression="zstd", compression_level=3 + ) # default afaik + covered_positions.write_parquet( + output + ".covered_positions.parquet", + compression="zstd", + compression_level=3, + ) # default afaik @cli.command() -@click.option('--parquet-coverage', type=click.Path(exists=False), - required=True, help=('Pre-computed coverage data as parquet. ' - 'This should be the basename used, i.e. 
' - 'for "foo.coverage.parquet", please use ' - '"foo"')) -@click.option('--sample-metadata', type=click.Path(exists=True), - required=True, - help='A metadata file with the sample metadata') -@click.option('--sample-metadata-column', type=str, - required=True, - help='The column to consider in the sample metadata') -@click.option('--features-to-keep', type=click.Path(exists=True), - required=False, - help='A metadata file with the features to keep') -@click.option('--output', type=click.Path(exists=False), required=True) -@click.option('--plot', is_flag=True, default=False, - help='Generate plots from features') -def per_sample_group(parquet_coverage, sample_metadata, sample_metadata_column, - features_to_keep, output, plot): +@click.option( + "--parquet-coverage", + type=click.Path(exists=False), + required=True, + help=( + "Pre-computed coverage data as parquet. " + "This should be the basename used, i.e. " + 'for "foo.coverage.parquet", please use ' + '"foo"' + ), +) +@click.option( + "--sample-metadata", + type=click.Path(exists=True), + required=True, + help="A metadata file with the sample metadata", +) +@click.option( + "--sample-metadata-column", + type=str, + required=True, + help="The column to consider in the sample metadata", +) +@click.option( + "--features-to-keep", + type=click.Path(exists=True), + required=False, + help="A metadata file with the features to keep", +) +@click.option("--output", type=click.Path(exists=False), required=True) +@click.option( + "--plot", is_flag=True, default=False, help="Generate plots from features" +) +def per_sample_group( + parquet_coverage, + sample_metadata, + sample_metadata_column, + features_to_keep, + output, + plot, +): """Generate sample group plots and coverage data.""" _load_db(parquet_coverage, sample_metadata, features_to_keep) @@ -239,8 +359,13 @@ def per_sample_group(parquet_coverage, sample_metadata, sample_metadata_column, all_coverage = duckdb.sql("SELECT * FROM coverage").pl() metadata_pl = duckdb.sql("SELECT * FROM metadata").pl() - per_sample_plots(all_coverage, all_covered_positions, metadata_pl, - sample_metadata_column, output) + per_sample_plots( + all_coverage, + all_covered_positions, + metadata_pl, + sample_metadata_column, + output, + ) def _load_db(dbbase, sample_metadata, features_to_keep): @@ -250,28 +375,34 @@ def _load_db(dbbase, sample_metadata, features_to_keep): samples = tuple(metadata_pl[sample_column].unique()) - sfilt = f'WHERE sample_id IN {samples}' + sfilt = f"WHERE sample_id IN {samples}" if features_to_keep: features_to_keep = tuple(_first_col_as_set(features_to_keep)) sgfilt = f"{sfilt} AND genome_id IN {features_to_keep}" else: sgfilt = sfilt - duckdb.sql(f"""CREATE VIEW coverage + duckdb.sql( + f"""CREATE VIEW coverage AS SELECT * FROM '{dbbase}.coverage.parquet' - {sgfilt}""") - duckdb.sql(f"""CREATE VIEW covered_positions + {sgfilt}""" + ) + duckdb.sql( + f"""CREATE VIEW covered_positions AS SELECT * FROM '{dbbase}.covered_positions.parquet' - {sgfilt}""") - duckdb.sql(f"""CREATE TABLE metadata + {sgfilt}""" + ) + duckdb.sql( + f"""CREATE TABLE metadata AS SELECT * FROM metadata_pl {sfilt} AND {COLUMN_SAMPLE_ID} IN (SELECT DISTINCT {COLUMN_SAMPLE_ID} - FROM coverage)""") + FROM coverage)""" + ) -if __name__ == '__main__': +if __name__ == "__main__": cli() diff --git a/micov/tests/test_convert.py b/micov/tests/test_convert.py index 2539e4f..c154489 100644 --- a/micov/tests/test_convert.py +++ b/micov/tests/test_convert.py @@ -4,9 +4,9 @@ class ConvertTests(unittest.TestCase): def 
diff --git a/micov/tests/test_convert.py b/micov/tests/test_convert.py
index 2539e4f..c154489 100644
--- a/micov/tests/test_convert.py
+++ b/micov/tests/test_convert.py
@@ -4,9 +4,9 @@ class ConvertTests(unittest.TestCase):
     def test_cigar_to_lens(self):
-        self.assertEqual(cigar_to_lens('150M'), 150)
-        self.assertEqual(cigar_to_lens('3M1I3M1D5M'), 12)
+        self.assertEqual(cigar_to_lens("150M"), 150)
+        self.assertEqual(cigar_to_lens("3M1I3M1D5M"), 12)


-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/micov/tests/test_cov.py b/micov/tests/test_cov.py
index 23176bf..c6592de 100644
--- a/micov/tests/test_cov.py
+++ b/micov/tests/test_cov.py
@@ -1,59 +1,78 @@
 import unittest
 from micov._cov import compress, coverage_percent
-from micov._constants import (BED_COV_SCHEMA, COLUMN_GENOME_ID,
-                              GENOME_LENGTH_SCHEMA,
-                              GENOME_COVERAGE_SCHEMA)
+from micov._constants import (
+    BED_COV_SCHEMA,
+    COLUMN_GENOME_ID,
+    GENOME_LENGTH_SCHEMA,
+    GENOME_COVERAGE_SCHEMA,
+)
 import polars as pl
 import polars.testing as plt


 class CovTests(unittest.TestCase):
     def test_compress(self):
-        exp = pl.DataFrame([['G123', 10, 50],
-                            ['G123', 51, 89],
-                            ['G123', 90, 100],
-                            ['G123', 101, 110],
-                            ['G456', 200, 300],
-                            ['G456', 400, 500]],
-                           orient='row',
-                           schema=BED_COV_SCHEMA.dtypes_flat)
-        data = pl.DataFrame([['G123', 11, 50],
-                             ['G123', 20, 30],
-                             ['G456', 200, 299],
-                             ['G123', 10, 12],
-                             ['G456', 201, 300],
-                             ['G123', 90, 100],
-                             ['G123', 51, 89],
-                             ['G123', 101, 110],
-                             ['G456', 400, 500]],
-                            orient='row',
-                            schema=BED_COV_SCHEMA.dtypes_flat)
+        exp = pl.DataFrame(
+            [
+                ["G123", 10, 50],
+                ["G123", 51, 89],
+                ["G123", 90, 100],
+                ["G123", 101, 110],
+                ["G456", 200, 300],
+                ["G456", 400, 500],
+            ],
+            orient="row",
+            schema=BED_COV_SCHEMA.dtypes_flat,
+        )
+        data = pl.DataFrame(
+            [
+                ["G123", 11, 50],
+                ["G123", 20, 30],
+                ["G456", 200, 299],
+                ["G123", 10, 12],
+                ["G456", 201, 300],
+                ["G123", 90, 100],
+                ["G123", 51, 89],
+                ["G123", 101, 110],
+                ["G456", 400, 500],
+            ],
+            orient="row",
+            schema=BED_COV_SCHEMA.dtypes_flat,
+        )
         obs = compress(data).sort(COLUMN_GENOME_ID)
         plt.assert_frame_equal(obs, exp)

     def test_coverage_percent(self):
-        data = pl.DataFrame([['G123', 11, 50],
-                             ['G456', 200, 299],
-                             ['G123', 90, 100],
-                             ['G456', 400, 500]],
-                            orient='row',
-                            schema=BED_COV_SCHEMA.dtypes_flat)
-        lengths = pl.DataFrame([['G123', 100],
-                                ['G456', 1000],
-                                ['G789', 500]],
-                               orient='row',
-                               schema=GENOME_LENGTH_SCHEMA.dtypes_flat)
+        data = pl.DataFrame(
+            [
+                ["G123", 11, 50],
+                ["G456", 200, 299],
+                ["G123", 90, 100],
+                ["G456", 400, 500],
+            ],
+            orient="row",
+            schema=BED_COV_SCHEMA.dtypes_flat,
+        )
+        lengths = pl.DataFrame(
+            [["G123", 100], ["G456", 1000], ["G789", 500]],
+            orient="row",
+            schema=GENOME_LENGTH_SCHEMA.dtypes_flat,
+        )
         g123_covered = (50 - 11) + (100 - 90)
         g456_covered = (299 - 200) + (500 - 400)
-        exp = pl.DataFrame([['G123', g123_covered, 100, (g123_covered / 100) * 100],
-                            ['G456', g456_covered, 1000, (g456_covered / 1000) * 100]],
-                           orient='row',
-                           schema=GENOME_COVERAGE_SCHEMA.dtypes_flat)
+        exp = pl.DataFrame(
+            [
+                ["G123", g123_covered, 100, (g123_covered / 100) * 100],
+                ["G456", g456_covered, 1000, (g456_covered / 1000) * 100],
+            ],
+            orient="row",
+            schema=GENOME_COVERAGE_SCHEMA.dtypes_flat,
+        )
         obs = coverage_percent(data, lengths).sort(COLUMN_GENOME_ID).collect()
         plt.assert_frame_equal(obs, exp)


-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
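
The expected frames in test_compress pin down compress()'s semantics: per genome, strictly overlapping intervals are merged, adjacent ones such as [10, 50] and [51, 89] stay separate, and covered length is sum(stop - start), consistent with half-open [start, stop) intervals. A pure-Python sketch of that merge rule for illustration only (the real implementation in _cov.py is polars/numba based):

    def merge_overlaps(intervals):
        # Merge strictly overlapping half-open [start, stop) intervals.
        out = []
        for start, stop in sorted(intervals):
            if out and start < out[-1][1]:
                out[-1][1] = max(out[-1][1], stop)
            else:
                out.append([start, stop])
        return out

    # Mirrors the expected G123 rows in test_compress above.
    assert merge_overlaps(
        [(11, 50), (20, 30), (10, 12), (90, 100), (51, 89), (101, 110)]
    ) == [[10, 50], [51, 89], [90, 100], [101, 110]]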
diff --git a/micov/tests/test_per_sample.py b/micov/tests/test_per_sample.py
index 97a0e9e..fb6592a 100644
--- a/micov/tests/test_per_sample.py
+++ b/micov/tests/test_per_sample.py
@@ -2,41 +2,56 @@
 import polars as pl
 import polars.testing as plt
 from micov._per_sample import compress_per_sample
-from micov._constants import (GENOME_LENGTH_SCHEMA,
-                              BED_COV_SAMPLEID_SCHEMA,
-                              COLUMN_GENOME_ID, COLUMN_SAMPLE_ID,
-                              GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA)
+from micov._constants import (
+    GENOME_LENGTH_SCHEMA,
+    BED_COV_SAMPLEID_SCHEMA,
+    COLUMN_GENOME_ID,
+    COLUMN_SAMPLE_ID,
+    GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA,
+)


 class Tests(unittest.TestCase):
     def test_compress_per_sample(self):
-        lengths = pl.DataFrame([['A', 400],
-                                ['B', 500]],
-                               orient='row',
-                               schema=GENOME_LENGTH_SCHEMA.dtypes_flat)
-        df = pl.DataFrame([['A', 10, 100, 'S1'],
-                           ['A', 10, 20, 'S1'],
-                           ['A', 90, 110, 'S1'],
-                           ['B', 50, 150, 'S1'],
-                           ['B', 200, 250, 'S1'],
-                           ['A', 90, 95, 'S2'],
-                           ['A', 50, 150, 'S2'],
-                           ['A', 200, 300, 'S1'],
-                           ['A', 201, 299, 'S1']],
-                          orient='row',
-                          schema=BED_COV_SAMPLEID_SCHEMA.dtypes_flat)
-        s1_a = ((110 - 10) + (300 - 200))
-        s1_b = ((150 - 50) + (250 - 200))
-        s2_a = (150 - 50)
-        exp = pl.DataFrame([['A', s1_a, 400, (s1_a / 400) * 100, 'S1'],
-                            ['A', s2_a, 400, (s2_a / 400) * 100, 'S2'],
-                            ['B', s1_b, 500, (s1_b / 500) * 100, 'S1']],
-                           orient='row',
-                           schema=GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA.dtypes_flat)
-        obs = compress_per_sample(df, lengths).sort([COLUMN_GENOME_ID,
-                                                     COLUMN_SAMPLE_ID]).collect()
+        lengths = pl.DataFrame(
+            [["A", 400], ["B", 500]],
+            orient="row",
+            schema=GENOME_LENGTH_SCHEMA.dtypes_flat,
+        )
+        df = pl.DataFrame(
+            [
+                ["A", 10, 100, "S1"],
+                ["A", 10, 20, "S1"],
+                ["A", 90, 110, "S1"],
+                ["B", 50, 150, "S1"],
+                ["B", 200, 250, "S1"],
+                ["A", 90, 95, "S2"],
+                ["A", 50, 150, "S2"],
+                ["A", 200, 300, "S1"],
+                ["A", 201, 299, "S1"],
+            ],
+            orient="row",
+            schema=BED_COV_SAMPLEID_SCHEMA.dtypes_flat,
+        )
+        s1_a = (110 - 10) + (300 - 200)
+        s1_b = (150 - 50) + (250 - 200)
+        s2_a = 150 - 50
+        exp = pl.DataFrame(
+            [
+                ["A", s1_a, 400, (s1_a / 400) * 100, "S1"],
+                ["A", s2_a, 400, (s2_a / 400) * 100, "S2"],
+                ["B", s1_b, 500, (s1_b / 500) * 100, "S1"],
+            ],
+            orient="row",
+            schema=GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA.dtypes_flat,
+        )
+        obs = (
+            compress_per_sample(df, lengths)
+            .sort([COLUMN_GENOME_ID, COLUMN_SAMPLE_ID])
+            .collect()
+        )
         plt.assert_frame_equal(obs, exp)


-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
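
The per-sample expectations apply the same merge rule within each (genome, sample) group before dividing by genome length. A quick cross-check of the S1/A value, reusing the merge_overlaps() helper sketched after test_cov.py above (illustrative only, same half-open interval assumption):

    # The S1/A subset of the test's df rows, as (start, stop) pairs.
    s1_a_intervals = [(10, 100), (10, 20), (90, 110), (200, 300), (201, 299)]
    covered = sum(stop - start for start, stop in merge_overlaps(s1_a_intervals))
    assert covered == 200  # (110 - 10) + (300 - 200); 50% of the 400 bp genome A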