Skip to content

Commit

Permalink
Merge pull request #13 from sherlyn99/develop
Browse files Browse the repository at this point in the history
Fixed a typo and added taxonomy support to micov compress
  • Loading branch information
wasade authored Jul 22, 2024
2 parents 02366fd + c846909 commit 3ef1a67
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 15 deletions.
8 changes: 8 additions & 0 deletions micov/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
COLUMN_CIGAR_DTYPE = str
COLUMN_LENGTH = 'length'
COLUMN_LENGTH_DTYPE = int
COLUMN_TAXONOMY = 'taxonomy'
COLUMN_TAXONOMY_DTYPE = str
COLUMN_COVERED = 'covered'
COLUMN_PERCENT_COVERED = 'percent_covered'
COLUMN_COVERED_DTYPE = int
Expand Down Expand Up @@ -73,6 +75,12 @@ class _GENOME_LENGTH_SCHEMA(_SCHEMA):
GENOME_LENGTH_SCHEMA = _GENOME_LENGTH_SCHEMA()


class _GENOME_TAXONOMY_SCHEMA(_SCHEMA):
    """Schema for a two-column genome-id -> taxonomy mapping table."""
    dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
                   (COLUMN_TAXONOMY, COLUMN_TAXONOMY_DTYPE)]


# Correctly spelled module-level singleton. The original name dropped an
# "O" ("TAXNOMY"); keep the misspelled name as a backward-compatible alias
# so any existing importer keeps working.
GENOME_TAXONOMY_SCHEMA = _GENOME_TAXONOMY_SCHEMA()
GENOME_TAXNOMY_SCHEMA = GENOME_TAXONOMY_SCHEMA


class _GENOME_COVERAGE_SCHEMA(_SCHEMA):
dtypes_flat = [(COLUMN_GENOME_ID, COLUMN_GENOME_ID_DTYPE),
(COLUMN_COVERED, COLUMN_COVERED_DTYPE),
Expand Down
69 changes: 64 additions & 5 deletions micov/_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
import gzip

from ._cov import compress, coverage_percent
from ._constants import (BED_COV_SCHEMA, COLUMN_GENOME_ID, COLUMN_LENGTH,
from ._constants import (BED_COV_SCHEMA, GENOME_COVERAGE_SCHEMA,
COLUMN_GENOME_ID, COLUMN_LENGTH, COLUMN_TAXONOMY,
SAM_SUBSET_SCHEMA, COLUMN_CIGAR, COLUMN_STOP,
COLUMN_START, COLUMN_SAMPLE_ID)
from ._convert import cigar_to_lens
Expand Down Expand Up @@ -151,8 +152,7 @@ def _test_has_header(line):
if isinstance(line, bytes):
line = line.decode('utf-8')

genome_id_columns = ('genome-id', 'genome_id', 'feature-id',
'feature_id')
genome_id_columns = (COLUMN_GENOME_ID)

if line.startswith('#'):
has_header = True
Expand All @@ -166,6 +166,24 @@ def _test_has_header(line):
return has_header


def _test_has_header_taxonomy(line):
    """Return True if *line* looks like the header of a taxonomy table.

    A header is assumed when the line starts with '#', or when the first
    two tab-delimited fields are the canonical genome-id and taxonomy
    column names.

    Parameters
    ----------
    line : str or bytes
        The first line of a taxonomy file; bytes are decoded as UTF-8.

    Returns
    -------
    bool
        Whether the line should be treated as a header row.
    """
    if isinstance(line, bytes):
        line = line.decode('utf-8')

    # NOTE: these must be 1-tuples. Bare parentheses leave plain strings,
    # which turns the membership tests below into substring matches (e.g.
    # "genome" in "genome_id" would be True).
    genome_id_columns = (COLUMN_GENOME_ID,)
    taxonomy_columns = (COLUMN_TAXONOMY,)

    if line.startswith('#'):
        return True

    # Strip the trailing newline so the last field compares cleanly, and
    # guard against lines with fewer than two fields (previously an
    # IndexError).
    fields = line.rstrip('\n').split('\t')
    return (len(fields) >= 2
            and fields[0] in genome_id_columns
            and fields[1] in taxonomy_columns)


def parse_genome_lengths(lengths):
with open(lengths) as fp:
first_line = fp.readline()
Expand All @@ -190,6 +208,39 @@ def parse_genome_lengths(lengths):
return df[[genome_id_col, length_col]].rename(rename)


def parse_taxonomy(taxonomy):
    """Read a genome-id -> taxonomy mapping from a TSV file.

    Only the first two columns are kept; they are renamed to the
    canonical COLUMN_GENOME_ID / COLUMN_TAXONOMY names. A header row is
    auto-detected from the file's first line.

    Raises
    ------
    ValueError
        If the genome-id column contains duplicate values.
    """
    with open(taxonomy) as fp:
        header_line = fp.readline()

    df = pl.read_csv(taxonomy, separator='\t',
                     has_header=_test_has_header_taxonomy(header_line))

    id_col, tax_col = df.columns[0], df.columns[1]

    ids = df[id_col]
    if len(set(ids)) != len(ids):
        raise ValueError(f"'{id_col}' is not unique")

    return (df[[id_col, tax_col]]
            .rename({id_col: COLUMN_GENOME_ID,
                     tax_col: COLUMN_TAXONOMY}))


def set_taxonomy_as_id(coverages, taxonomy):
    """Attach taxonomy labels to a coverage table, as the leading column.

    Every genome in *coverages* must be present in *taxonomy*; otherwise
    a ValueError is raised with a few example missing IDs.
    """
    covered_ids = set(coverages[COLUMN_GENOME_ID])
    known_ids = set(taxonomy[COLUMN_GENOME_ID])
    missing = covered_ids - known_ids
    if missing:
        raise ValueError(f"{len(missing)} genome(s) appear unrepresented in "
                         f"the taxonomy information, examples: "
                         f"{sorted(missing)[:5]}")

    # inner join is safe here: the missing-ID check above guarantees no
    # coverage rows are dropped. Reorder so taxonomy comes first.
    joined = coverages.join(taxonomy, on=COLUMN_GENOME_ID, how='inner')
    return joined.select(COLUMN_TAXONOMY, pl.exclude(COLUMN_TAXONOMY))


# TODO: this is not the greatest method name
def parse_sam_to_df(sam):
df = pl.read_csv(sam, separator='\t', has_header=False,
Expand Down Expand Up @@ -305,7 +356,15 @@ def compress_from_stream(sam, bufsize=100_000_000, disable_compression=False):
if len(buf) == 0:
return None

if len(buf[0].split(b'\t')) == 3:
line = buf[0]
if isinstance(line, str):
delim = '\t'
elif isinstance(line, bytes):
delim = b'\t'
else:
raise ValueError(f"Unexpected buffer type: {type(line)}")

if len(line.split(delim)) == 3:
parse_f = parse_bed_cov_to_df
else:
parse_f = parse_sam_to_df
Expand All @@ -324,6 +383,6 @@ def parse_coverage(data, features_to_keep):
schema_overrides=GENOME_COVERAGE_SCHEMA.dtypes_dict).lazy()

if features_to_keep is not None:
cov_df = cov_df.filter(pl.col(COLUMN_GENOME_ID).is_in(feature_keep))
cov_df = cov_df.filter(pl.col(COLUMN_GENOME_ID).is_in(features_to_keep))

return cov_df
26 changes: 21 additions & 5 deletions micov/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
import io
import sys
import tqdm
from ._io import (parse_genome_lengths, parse_qiita_coverages, parse_sam_to_df,
write_qiita_cov, parse_sample_metadata, compress_from_stream,
from ._io import (parse_genome_lengths, parse_taxonomy, set_taxonomy_as_id,
parse_qiita_coverages, parse_sam_to_df, write_qiita_cov,
parse_sample_metadata, compress_from_stream,
parse_bed_cov_to_df)
from ._cov import coverage_percent
from ._convert import cigar_to_lens
Expand Down Expand Up @@ -82,8 +83,11 @@ def qiita_coverage(qiita_coverages, samples_to_keep, samples_to_ignore,
@click.option('--disable-compression', is_flag=True, default=False,
help='Do not compress the regions')
@click.option('--lengths', type=click.Path(exists=True), required=False,
help="Genome lengths, if provided compute coverage")
def compress(data, output, disable_compression, lengths):
help='Genome lengths, if provided compute coverage')
@click.option('--taxonomy', type=click.Path(exists=True), required=False,
help=('Genome taxonomy, if provided show species in coverage '
'percentage. Only works when --length is provided'))
def compress(data, output, disable_compression, lengths, taxonomy):
"""Compress BAM/SAM/BED mapping data.
This command can work with pipes, e.g.:
Expand All @@ -96,6 +100,9 @@ def compress(data, output, disable_compression, lengths):
if lengths is not None:
lengths = parse_genome_lengths(lengths)

if taxonomy is not None:
taxonomy = parse_taxonomy(taxonomy)

# compress data in blocks to avoid loading full mapping data into memory
# and compress as we go along.
df = compress_from_stream(data, disable_compression=disable_compression)
Expand All @@ -107,7 +114,16 @@ def compress(data, output, disable_compression, lengths):
df.write_csv(output, separator='\t', include_header=True)
else:
genome_coverage = coverage_percent(df, lengths).collect()
genome_coverage.write_csv(output, separator='\t', include_header=True)

if taxonomy is None:
genome_coverage.write_csv(output, separator='\t', include_header=True)
else:
genome_coverage_with_taxonomy = set_taxonomy_as_id(
genome_coverage, taxonomy
)
genome_coverage_with_taxonomy.write_csv(
output, separator='\t', include_header=True
)


@cli.command()
Expand Down
82 changes: 77 additions & 5 deletions micov/tests/test_io.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import unittest
import io
import sys
from micov._io import (parse_genome_lengths, parse_qiita_coverages,
_single_df, _check_and_compress, parse_sam_to_df,
compress_from_stream, write_qiita_cov)
from micov._io import (parse_genome_lengths, parse_taxonomy, set_taxonomy_as_id,
parse_qiita_coverages, _single_df, _check_and_compress,
parse_sam_to_df, compress_from_stream, write_qiita_cov)
from micov._constants import (BED_COV_SCHEMA, COLUMN_GENOME_ID, COLUMN_START,
COLUMN_LENGTH, SAM_SUBSET_SCHEMA_PARSED,
GENOME_COVERAGE_SCHEMA, GENOME_LENGTH_SCHEMA)
COLUMN_LENGTH, COLUMN_TAXONOMY,
SAM_SUBSET_SCHEMA_PARSED, GENOME_COVERAGE_SCHEMA,
GENOME_LENGTH_SCHEMA)
import tempfile
import tarfile
import time
Expand Down Expand Up @@ -304,6 +305,77 @@ def test_parse_genome_lengths_bad_sizes(self):
with self.assertRaisesRegex(ValueError, "Lengths of zero or less"):
parse_genome_lengths(self.name)

def test_parse_taxonomy_good(self):
    """A headered 3-column file keeps only the first two, renamed."""
    rows = [("a", "species1"), ("b", "species2"), ("c", "species3")]

    lines = ["genome_id\ttaxonomy\tbaz"]
    lines += [f"{gid}\t{tax}\txyz" for gid, tax in rows]
    with open(self.name, 'w') as fp:
        fp.write("\n".join(lines) + "\n")

    exp = pl.DataFrame([list(r) for r in rows],
                       schema=[COLUMN_GENOME_ID, COLUMN_TAXONOMY])
    obs = parse_taxonomy(self.name)
    plt.assert_frame_equal(obs, exp)

def test_parse_taxonomy_noheader(self):
    """A file without a header row is parsed positionally."""
    rows = [("a", "species1"), ("b", "species2"), ("c", "species3")]

    with open(self.name, 'w') as fp:
        for gid, tax in rows:
            fp.write(f"{gid}\t{tax}\txyz\n")

    exp = pl.DataFrame([list(r) for r in rows],
                       schema=[COLUMN_GENOME_ID, COLUMN_TAXONOMY])
    obs = parse_taxonomy(self.name)
    plt.assert_frame_equal(obs, exp)

def test_set_taxonomy_as_id(self):
    """Coverage rows are preserved; taxonomy becomes the first column."""
    cov_data = {
        "genome_id": ["G000006925", "G000007525", "G000008865"],
        "covered": [2501356, 4378, 2582128],
        "length": [4828820, 2260266, 5594477],
        "percent_covered": [51.800564112971706, 0.19369401654495533,
                            46.154948889771106],
    }
    cov = pl.DataFrame(cov_data)

    tax = pl.DataFrame({
        "genome_id": ["G000009925", "G000010425", "G000006925",
                      "G000007525", "G000008865"],
        "taxonomy": ["species1", "species2", "species3", "species4",
                     "species5"],
    })

    # expected: the same coverage rows with the matching taxonomy label
    # prepended as the leading column
    exp = pl.DataFrame({"taxonomy": ["species3", "species4", "species5"],
                        **cov_data})

    obs = set_taxonomy_as_id(cov, tax)
    plt.assert_frame_equal(obs, exp)

def test_taxonomy_as_id_missing_taxonomy(self):
    """A coverage genome absent from the taxonomy raises ValueError."""
    cov = pl.DataFrame({
        "genome_id": ["G000006925", "G000007525", "G000008865"],
        "covered": [2501356, 4378, 2582128],
        "length": [4828820, 2260266, 5594477],
        "percent_covered": [51.800564112971706, 0.19369401654495533,
                            46.154948889771106],
    })

    # G000008865 is deliberately absent from the taxonomy table
    tax = pl.DataFrame({
        "genome_id": ["G000009925", "G000010425", "G000006925",
                      "G000007525"],
        "taxonomy": ["species1", "species2", "species3", "species4"],
    })

    # The previous pattern "[G000008865]" was a regex character class
    # matching any single one of those characters (i.e. nearly any
    # message); match the literal genome id instead.
    with self.assertRaisesRegex(ValueError, "G000008865"):
        set_taxonomy_as_id(cov, tax)

def test_compress_from_stream(self):
data = io.BytesIO(
b"A\t0\tX\t1\t1\t50M\t*\t0\t0\t*\t*\n"
Expand Down

0 comments on commit 3ef1a67

Please sign in to comment.