Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

formatted by black formatter #16

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion micov/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""micov: microbiome coverage."""

from . import _version
__version__ = _version.get_versions()['version']

__version__ = _version.get_versions()["version"]
119 changes: 76 additions & 43 deletions micov/_constants.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,56 @@
COLUMN_GENOME_ID = 'genome_id'
COLUMN_GENOME_ID = "genome_id"
COLUMN_GENOME_ID_DTYPE = str
COLUMN_SAMPLE_ID = 'sample_id'
COLUMN_SAMPLE_ID = "sample_id"
COLUMN_SAMPLE_ID_DTYPE = str
COLUMN_START = 'start'
COLUMN_START = "start"
COLUMN_START_DTYPE = int
COLUMN_STOP = 'stop'
COLUMN_STOP = "stop"
COLUMN_STOP_DTYPE = int
COLUMN_READ_ID = 'read'
COLUMN_READ_ID = "read"
COLUMN_READ_ID_DTYPE = str
COLUMN_FLAG = 'flag'
COLUMN_FLAG = "flag"
COLUMN_FLAG_DTYPE = int
COLUMN_CIGAR = 'cigar'
COLUMN_CIGAR = "cigar"
COLUMN_CIGAR_DTYPE = str
COLUMN_LENGTH = 'length'
COLUMN_LENGTH = "length"
COLUMN_LENGTH_DTYPE = int
COLUMN_TAXONOMY = 'taxonomy'
COLUMN_TAXONOMY = "taxonomy"
COLUMN_TAXONOMY_DTYPE = str
COLUMN_COVERED = 'covered'
COLUMN_PERCENT_COVERED = 'percent_covered'
COLUMN_COVERED = "covered"
COLUMN_PERCENT_COVERED = "percent_covered"
COLUMN_COVERED_DTYPE = int
COLUMN_PERCENT_COVERED = 'percent_covered'
COLUMN_PERCENT_COVERED = "percent_covered"
COLUMN_PERCENT_COVERED_DTYPE = float

### should really probably just use a dataclass, and type annotations?


class _SCHEMA:
def __init__(self):
self.dtypes_dict = dict(self.dtypes_flat)
self.columns = tuple([c for c, _ in self.dtypes_flat])


class _BED_COV_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE),
]


BED_COV_SCHEMA = _BED_COV_SCHEMA()


class _BED_COV_WITH_SAMPLEID_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE),
(COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE),
(COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE),
]


BED_COV_SAMPLEID_SCHEMA = _BED_COV_WITH_SAMPLEID_SCHEMA()


Expand All @@ -51,48 +60,72 @@ class _SAM_SUBSET_SCHEMA(_SCHEMA):
# for binary coverage, we don't care about the flag, but we're parsing it
# now so we can care in the future.
column_indices = [0, 1, 2, 3, 5]
dtypes_flat = [(COLUMN_READ_ID, COLUMN_READ_ID_DTYPE),
(COLUMN_FLAG, COLUMN_FLAG_DTYPE),
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_CIGAR, COLUMN_CIGAR_DTYPE)]
dtypes_flat = [
(COLUMN_READ_ID, COLUMN_READ_ID_DTYPE),
(COLUMN_FLAG, COLUMN_FLAG_DTYPE),
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_CIGAR, COLUMN_CIGAR_DTYPE),
]


SAM_SUBSET_SCHEMA = _SAM_SUBSET_SCHEMA()


class _SAM_SUBSET_SCHEMA_PARSED(_SCHEMA):
dtypes_flat = [(COLUMN_READ_ID, COLUMN_READ_ID_DTYPE),
(COLUMN_FLAG, COLUMN_FLAG_DTYPE),
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_CIGAR, COLUMN_CIGAR_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE)]
dtypes_flat = [
(COLUMN_READ_ID, COLUMN_READ_ID_DTYPE),
(COLUMN_FLAG, COLUMN_FLAG_DTYPE),
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_START, COLUMN_START_DTYPE),
(COLUMN_CIGAR, COLUMN_CIGAR_DTYPE),
(COLUMN_STOP, COLUMN_STOP_DTYPE),
]


SAM_SUBSET_SCHEMA_PARSED = _SAM_SUBSET_SCHEMA_PARSED()


class _GENOME_LENGTH_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE),
]


GENOME_LENGTH_SCHEMA = _GENOME_LENGTH_SCHEMA()


class _GENOME_TAXONOMY_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_TAXONOMY, COLUMN_TAXONOMY_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_TAXONOMY, COLUMN_TAXONOMY_DTYPE),
]


GENOME_TAXNOMY_SCHEMA = _GENOME_TAXONOMY_SCHEMA()


class _GENOME_COVERAGE_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_COVERED, COLUMN_COVERED_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE),
(COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_COVERED, COLUMN_COVERED_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE),
(COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE),
]


GENOME_COVERAGE_SCHEMA = _GENOME_COVERAGE_SCHEMA()


class _GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_COVERED, COLUMN_COVERED_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE),
(COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE),
(COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE)]
dtypes_flat = [
(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_COVERED, COLUMN_COVERED_DTYPE),
(COLUMN_LENGTH, COLUMN_LENGTH_DTYPE),
(COLUMN_PERCENT_COVERED, COLUMN_PERCENT_COVERED_DTYPE),
(COLUMN_SAMPLE_ID, COLUMN_SAMPLE_ID_DTYPE),
]


GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA = _GENOME_COVERAGE_WITH_SAMPLEID_SCHEMA()
10 changes: 5 additions & 5 deletions micov/_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,14 @@ def cigar_to_lens(cigar):

"""
align, offset = 0, 0
n = '' # current step size
n = "" # current step size
for c in cigar:
if c in 'MDIHNPSX=':
if c in 'M=X':
if c in "MDIHNPSX=":
if c in "M=X":
align += int(n)
elif c in 'DN':
elif c in "DN":
offset += int(n)
n = ''
n = ""
else:
n += c

Expand Down
87 changes: 56 additions & 31 deletions micov/_cov.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,44 @@

import numba
import polars as pl
from ._constants import (COLUMN_GENOME_ID, COLUMN_START, COLUMN_STOP,
COLUMN_COVERED, BED_COV_SCHEMA,
COLUMN_LENGTH, COLUMN_PERCENT_COVERED)
from ._constants import (
COLUMN_GENOME_ID,
COLUMN_START,
COLUMN_STOP,
COLUMN_COVERED,
BED_COV_SCHEMA,
COLUMN_LENGTH,
COLUMN_PERCENT_COVERED,
)


def coverage_percent(coverages, lengths):
missing = (set(coverages[COLUMN_GENOME_ID]) -
set(lengths[COLUMN_GENOME_ID]))
missing = set(coverages[COLUMN_GENOME_ID]) - set(lengths[COLUMN_GENOME_ID])
if len(missing) > 0:
raise ValueError(f"{len(missing)} genome(s) appear unrepresented in "
f"the length information, examples: "
f"{sorted(missing)[:5]}")
raise ValueError(
f"{len(missing)} genome(s) appear unrepresented in "
f"the length information, examples: "
f"{sorted(missing)[:5]}"
)

return (coverages
.lazy()
.with_columns((pl.col(COLUMN_STOP) -
pl.col(COLUMN_START)).alias(COLUMN_COVERED))
.group_by([COLUMN_GENOME_ID, ])
.agg(pl.col(COLUMN_COVERED).sum())
.join(lengths.lazy(), on=COLUMN_GENOME_ID)
.with_columns(((pl.col(COLUMN_COVERED) /
pl.col(COLUMN_LENGTH)) * 100).alias(COLUMN_PERCENT_COVERED))) # noqa
return (
coverages.lazy()
.with_columns(
(pl.col(COLUMN_STOP) - pl.col(COLUMN_START)).alias(COLUMN_COVERED)
)
.group_by(
[
COLUMN_GENOME_ID,
]
)
.agg(pl.col(COLUMN_COVERED).sum())
.join(lengths.lazy(), on=COLUMN_GENOME_ID)
.with_columns(
((pl.col(COLUMN_COVERED) / pl.col(COLUMN_LENGTH)) * 100).alias(
COLUMN_PERCENT_COVERED
)
)
) # noqa


@numba.jit(nopython=True)
Expand Down Expand Up @@ -59,22 +75,31 @@ def _compress(rows):

def compress(df):
compressed = []
for (genome, ), grp in df.group_by([COLUMN_GENOME_ID, ]):
rows = (grp
.lazy()
.select([COLUMN_START, COLUMN_STOP])
.sort(COLUMN_START)
.collect()
.to_numpy(order='c'))
for (genome,), grp in df.group_by(
[
COLUMN_GENOME_ID,
]
):
rows = (
grp.lazy()
.select([COLUMN_START, COLUMN_STOP])
.sort(COLUMN_START)
.collect()
.to_numpy(order="c")
)

grp_compressed = _compress(rows)
grp_compressed_df = pl.LazyFrame(grp_compressed,
schema=[BED_COV_SCHEMA.dtypes_flat[1],
BED_COV_SCHEMA.dtypes_flat[2]],
orient='row')
grp_compressed_df = (grp_compressed_df
.with_columns(pl.lit(genome).alias(COLUMN_GENOME_ID))
.select(BED_COV_SCHEMA.columns))
grp_compressed_df = pl.LazyFrame(
grp_compressed,
schema=[
BED_COV_SCHEMA.dtypes_flat[1],
BED_COV_SCHEMA.dtypes_flat[2],
],
orient="row",
)
grp_compressed_df = grp_compressed_df.with_columns(
pl.lit(genome).alias(COLUMN_GENOME_ID)
).select(BED_COV_SCHEMA.columns)

compressed.append(grp_compressed_df.collect())

Expand Down
Loading
Loading