simplify compress, remove np version

biocore · May 30, 2024 · 641db04 · 641db04
1 parent 890ae8e
commit 641db04
Showing 1 changed file with 5 additions and 52 deletions.
diff --git a/micov/_cov.py b/micov/_cov.py
@@ -35,18 +35,10 @@ def _compress(rows):
     start_val = None
     end_val = None
 
-    # Potential expansion:
-    # allocate new ranges as 2D np zeros
-    # resize
-    # bulk
-    # push setting of genome into df
-
-    for start, stop in rows:
-        if end_val is None:
-            # case 1: no active range, start active range.
-            start_val = start
-            end_val = stop
-        elif end_val >= start:
+    # case 1: no active range, start active range.
+    start_val, end_val = rows[0]
+    for start, stop in rows[1:]:
+        if end_val >= start:
             # case 2: active range continues through this range
             # extend active range
             end_val = max(end_val, stop)
@@ -65,43 +57,6 @@ def _compress(rows):
     return new_ranges
 
 
-@numba.jit(nopython=True)
-def _compress_np(rows, new_ranges):
-    # derived from zebra
-    # https://github.com/biocore/zebra_filter/blob/master/cover.py#L14
-
-    start_val = None
-    end_val = None
-
-    idx = 0
-    row = new_ranges[idx]
-
-    start_val, end_val = rows[0]
-    for start, stop in rows[1:]:
-        if end_val >= start:
-            # case 2: active range continues through this range
-            # extend active range
-            end_val = max(end_val, stop)
-        else:  # if end_val < r[0] - 1:
-            # case 3: active range ends before this range begins
-            # write new range out, then start new active range
-            row[0] = start_val
-            row[1] = end_val
-
-            idx += 1
-            row = new_ranges[idx]
-
-            start_val = start
-            end_val = stop
-
-    if end_val is not None:
-        row[0] = start_val
-        row[1] = end_val
-        idx += 1
-
-    return new_ranges[:idx]
-
-
 def compress(df):
     compressed = []
     for (genome, ), grp in df.group_by([COLUMN_GENOME_ID, ]):
@@ -112,9 +67,7 @@ def compress(df):
                  .collect()
                  .to_numpy())
 
-        new_ranges = np.empty((len(rows), 2), dtype=int)
-
-        grp_compressed = _compress_np(rows, new_ranges)
+        grp_compressed = _compress(rows)
         grp_compressed_df = pl.LazyFrame(grp_compressed,
                                          schema=[BED_COV_SCHEMA.dtypes_flat[1],
                                                  BED_COV_SCHEMA.dtypes_flat[2]],