Skip to content

Commit

Permalink
Merge pull request #13 from sherlyn99/develop
Browse files Browse the repository at this point in the history
Fixed a typo and added taxonomy support to micov compress
  • Loading branch information
wasade authored Jul 22, 2024
2 parents 02366fd + c846909 commit 3ef1a67
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 15 deletions.
8 changes: 8 additions & 0 deletions micov/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
COLUMN_CIGAR_DTYPE = str
COLUMN_LENGTH = 'length'
COLUMN_LENGTH_DTYPE = int
COLUMN_TAXONOMY = 'taxonomy'
COLUMN_TAXONOMY_DTYPE = str
COLUMN_COVERED = 'covered'
COLUMN_PERCENT_COVERED = 'percent_covered'
COLUMN_COVERED_DTYPE = int
Expand Down Expand Up @@ -73,6 +75,12 @@ class _GENOME_LENGTH_SCHEMA(_SCHEMA):
GENOME_LENGTH_SCHEMA = _GENOME_LENGTH_SCHEMA()


class _GENOME_TAXONOMY_SCHEMA(_SCHEMA):
    """Schema for a two-column genome-id -> taxonomy mapping table."""
    dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
                   (COLUMN_TAXONOMY, COLUMN_TAXONOMY_DTYPE)]


# Correctly spelled module-level singleton. The original name dropped an
# "O" ("TAXNOMY"); keep the misspelled name as a backward-compatible alias
# so any existing importer keeps working.
GENOME_TAXONOMY_SCHEMA = _GENOME_TAXONOMY_SCHEMA()
GENOME_TAXNOMY_SCHEMA = GENOME_TAXONOMY_SCHEMA


class _GENOME_COVERAGE_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_COVERED, COLUMN_COVERED_DTYPE),
Expand Down
69 changes: 64 additions & 5 deletions micov/_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
import gzip

from ._cov import compress, coverage_percent
from ._constants import (BED_COV_SCHEMA, COLUMN_GENOME_ID, COLUMN_LENGTH,
from ._constants import (BED_COV_SCHEMA, GENOME_COVERAGE_SCHEMA,
COLUMN_GENOME_ID, COLUMN_LENGTH, COLUMN_TAXONOMY,
SAM_SUBSET_SCHEMA, COLUMN_CIGAR, COLUMN_STOP,
COLUMN_START, COLUMN_SAMPLE_ID)
from ._convert import cigar_to_lens
Expand Down Expand Up @@ -151,8 +152,7 @@ def _test_has_header(line):
if isinstance(line, bytes):
line = line.decode('utf-8')

genome_id_columns = ('genome-id', 'genome_id', 'feature-id',
'feature_id')
genome_id_columns = (COLUMN_GENOME_ID)

if line.startswith('#'):
has_header = True
Expand All @@ -166,6 +166,24 @@ def _test_has_header(line):
return has_header


def _test_has_header_taxonomy(line):
    """Return True if *line* looks like the header of a taxonomy table.

    A header is assumed when the line starts with '#', or when the first
    two tab-delimited fields are the canonical genome-id and taxonomy
    column names.

    Parameters
    ----------
    line : str or bytes
        The first line of a taxonomy file; bytes are decoded as UTF-8.

    Returns
    -------
    bool
        Whether the line should be treated as a header row.
    """
    if isinstance(line, bytes):
        line = line.decode('utf-8')

    # NOTE: these must be 1-tuples. Bare parentheses leave plain strings,
    # which turns the membership tests below into substring matches (e.g.
    # "genome" in "genome_id" would be True).
    genome_id_columns = (COLUMN_GENOME_ID,)
    taxonomy_columns = (COLUMN_TAXONOMY,)

    if line.startswith('#'):
        return True

    # Strip the trailing newline so the last field compares cleanly, and
    # guard against lines with fewer than two fields (previously an
    # IndexError).
    fields = line.rstrip('\n').split('\t')
    return (len(fields) >= 2
            and fields[0] in genome_id_columns
            and fields[1] in taxonomy_columns)


def parse_genome_lengths(lengths):
with open(lengths) as fp:
first_line = fp.readline()
Expand All @@ -190,6 +208,39 @@ def parse_genome_lengths(lengths):
return df[[genome_id_col, length_col]].rename(rename)


def parse_taxonomy(taxonomy):
    """Read a genome-id -> taxonomy mapping from a TSV file.

    Only the first two columns are kept; they are renamed to the
    canonical COLUMN_GENOME_ID / COLUMN_TAXONOMY names. A header row is
    auto-detected from the file's first line.

    Raises
    ------
    ValueError
        If the genome-id column contains duplicate values.
    """
    with open(taxonomy) as fp:
        header_line = fp.readline()

    df = pl.read_csv(taxonomy, separator='\t',
                     has_header=_test_has_header_taxonomy(header_line))

    id_col, tax_col = df.columns[0], df.columns[1]

    ids = df[id_col]
    if len(set(ids)) != len(ids):
        raise ValueError(f"'{id_col}' is not unique")

    return (df[[id_col, tax_col]]
            .rename({id_col: COLUMN_GENOME_ID,
                     tax_col: COLUMN_TAXONOMY}))


def set_taxonomy_as_id(coverages, taxonomy):
    """Attach taxonomy labels to a coverage table, as the leading column.

    Every genome in *coverages* must be present in *taxonomy*; otherwise
    a ValueError is raised with a few example missing IDs.
    """
    covered_ids = set(coverages[COLUMN_GENOME_ID])
    known_ids = set(taxonomy[COLUMN_GENOME_ID])
    missing = covered_ids - known_ids
    if missing:
        raise ValueError(f"{len(missing)} genome(s) appear unrepresented in "
                         f"the taxonomy information, examples: "
                         f"{sorted(missing)[:5]}")

    # inner join is safe here: the missing-ID check above guarantees no
    # coverage rows are dropped. Reorder so taxonomy comes first.
    joined = coverages.join(taxonomy, on=COLUMN_GENOME_ID, how='inner')
    return joined.select(COLUMN_TAXONOMY, pl.exclude(COLUMN_TAXONOMY))


# TODO: this is not the greatest method name
def parse_sam_to_df(sam):
df = pl.read_csv(sam, separator='\t', has_header=False,
Expand Down Expand Up @@ -305,7 +356,15 @@ def compress_from_stream(sam, bufsize=100_000_000, disable_compression=False):
if len(buf) == 0:
return None

if len(buf[0].split(b'\t')) == 3:
line = buf[0]
if isinstance(line, str):
delim = '\t'
elif isinstance(line, bytes):
delim = b'\t'
else:
raise ValueError(f"Unexpected buffer type: {type(line)}")

if len(line.split(delim)) == 3:
parse_f = parse_bed_cov_to_df
else:
parse_f = parse_sam_to_df
Expand All @@ -324,6 +383,6 @@ def parse_coverage(data, features_to_keep):
schema_overrides=GENOME_COVERAGE_SCHEMA.dtypes_dict).lazy()

if features_to_keep is not None:
cov_df = cov_df.filter(pl.col(COLUMN_GENOME_ID).is_in(feature_keep))
cov_df = cov_df.filter(pl.col(COLUMN_GENOME_ID).is_in(features_to_keep))

return cov_df
26 changes: 21 additions & 5 deletions micov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
import io
import sys
import tqdm
from ._io import (parse_genome_lengths, parse_qiita_coverages, parse_sam_to_df,
write_qiita_cov, parse_sample_metadata, compress_from_stream,
from ._io import (parse_genome_lengths, parse_taxonomy, set_taxonomy_as_id,
parse_qiita_coverages, parse_sam_to_df, write_qiita_cov,
parse_sample_metadata, compress_from_stream,
parse_bed_cov_to_df)
from ._cov import coverage_percent
from ._convert import cigar_to_lens
Expand Down Expand Up @@ -82,8 +83,11 @@ def qiita_coverage(qiita_coverages, samples_to_keep, samples_to_ignore,
@click.option('--disable-compression', is_flag=True, default=False,
help='Do not compress the regions')
@click.option('--lengths', type=click.Path(exists=True), required=False,
help="Genome lengths, if provided compute coverage")
def compress(data, output, disable_compression, lengths):
help='Genome lengths, if provided compute coverage')
@click.option('--taxonomy', type=click.Path(exists=True), required=False,
help=('Genome taxonomy, if provided show species in coverage '
'percentage. Only works when --length is provided'))
def compress(data, output, disable_compression, lengths, taxonomy):
"""Compress BAM/SAM/BED mapping data.
This command can work with pipes, e.g.:
Expand All @@ -96,6 +100,9 @@ def compress(data, output, disable_compression, lengths):
if lengths is not None:
lengths = parse_genome_lengths(lengths)

if taxonomy is not None:
taxonomy = parse_taxonomy(taxonomy)

# compress data in blocks to avoid loading full mapping data into memory
# and compress as we go along.
df = compress_from_stream(data, disable_compression=disable_compression)
Expand All @@ -107,7 +114,16 @@ def compress(data, output, disable_compression, lengths):
df.write_csv(output, separator='\t', include_header=True)
else:
genome_coverage = coverage_percent(df, lengths).collect()
genome_coverage.write_csv(output, separator='\t', include_header=True)

if taxonomy is None:
genome_coverage.write_csv(output, separator='\t', include_header=True)
else:
genome_coverage_with_taxonomy = set_taxonomy_as_id(
genome_coverage, taxonomy
)
genome_coverage_with_taxonomy.write_csv(
output, separator='\t', include_header=True
)


@cli.command()
Expand Down
82 changes: 77 additions & 5 deletions micov/tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import unittest
import io
import sys
from micov._io import (parse_genome_lengths, parse_qiita_coverages,
_single_df, _check_and_compress, parse_sam_to_df,
compress_from_stream, write_qiita_cov)
from micov._io import (parse_genome_lengths, parse_taxonomy, set_taxonomy_as_id,
parse_qiita_coverages, _single_df, _check_and_compress,
parse_sam_to_df, compress_from_stream, write_qiita_cov)
from micov._constants import (BED_COV_SCHEMA, COLUMN_GENOME_ID, COLUMN_START,
COLUMN_LENGTH, SAM_SUBSET_SCHEMA_PARSED,
GENOME_COVERAGE_SCHEMA, GENOME_LENGTH_SCHEMA)
COLUMN_LENGTH, COLUMN_TAXONOMY,
SAM_SUBSET_SCHEMA_PARSED, GENOME_COVERAGE_SCHEMA,
GENOME_LENGTH_SCHEMA)
import tempfile
import tarfile
import time
Expand Down Expand Up @@ -304,6 +305,77 @@ def test_parse_genome_lengths_bad_sizes(self):
with self.assertRaisesRegex(ValueError, "Lengths of zero or less"):
parse_genome_lengths(self.name)

def test_parse_taxonomy_good(self):
    """A headered 3-column file keeps only the first two, renamed."""
    rows = [("a", "species1"), ("b", "species2"), ("c", "species3")]

    lines = ["genome_id\ttaxonomy\tbaz"]
    lines += [f"{gid}\t{tax}\txyz" for gid, tax in rows]
    with open(self.name, 'w') as fp:
        fp.write("\n".join(lines) + "\n")

    exp = pl.DataFrame([list(r) for r in rows],
                       schema=[COLUMN_GENOME_ID, COLUMN_TAXONOMY])
    obs = parse_taxonomy(self.name)
    plt.assert_frame_equal(obs, exp)

def test_parse_taxonomy_noheader(self):
    """A file without a header row is parsed positionally."""
    rows = [("a", "species1"), ("b", "species2"), ("c", "species3")]

    with open(self.name, 'w') as fp:
        for gid, tax in rows:
            fp.write(f"{gid}\t{tax}\txyz\n")

    exp = pl.DataFrame([list(r) for r in rows],
                       schema=[COLUMN_GENOME_ID, COLUMN_TAXONOMY])
    obs = parse_taxonomy(self.name)
    plt.assert_frame_equal(obs, exp)

def test_set_taxonomy_as_id(self):
    """Coverage rows are preserved; taxonomy becomes the first column."""
    cov_data = {
        "genome_id": ["G000006925", "G000007525", "G000008865"],
        "covered": [2501356, 4378, 2582128],
        "length": [4828820, 2260266, 5594477],
        "percent_covered": [51.800564112971706, 0.19369401654495533,
                            46.154948889771106],
    }
    cov = pl.DataFrame(cov_data)

    tax = pl.DataFrame({
        "genome_id": ["G000009925", "G000010425", "G000006925",
                      "G000007525", "G000008865"],
        "taxonomy": ["species1", "species2", "species3", "species4",
                     "species5"],
    })

    # expected: the same coverage rows with the matching taxonomy label
    # prepended as the leading column
    exp = pl.DataFrame({"taxonomy": ["species3", "species4", "species5"],
                        **cov_data})

    obs = set_taxonomy_as_id(cov, tax)
    plt.assert_frame_equal(obs, exp)

def test_taxonomy_as_id_missing_taxonomy(self):
    """A coverage genome absent from the taxonomy raises ValueError."""
    cov = pl.DataFrame({
        "genome_id": ["G000006925", "G000007525", "G000008865"],
        "covered": [2501356, 4378, 2582128],
        "length": [4828820, 2260266, 5594477],
        "percent_covered": [51.800564112971706, 0.19369401654495533,
                            46.154948889771106],
    })

    # G000008865 is deliberately absent from the taxonomy table
    tax = pl.DataFrame({
        "genome_id": ["G000009925", "G000010425", "G000006925",
                      "G000007525"],
        "taxonomy": ["species1", "species2", "species3", "species4"],
    })

    # The previous pattern "[G000008865]" was a regex character class
    # matching any single one of those characters (i.e. nearly any
    # message); match the literal genome id instead.
    with self.assertRaisesRegex(ValueError, "G000008865"):
        set_taxonomy_as_id(cov, tax)

def test_compress_from_stream(self):
data = io.BytesIO(
b"A\t0\tX\t1\t1\t50M\t*\t0\t0\t*\t*\n"
Expand Down

0 comments on commit 3ef1a67

Please sign in to comment.