Skip to content

Commit

Permalink
simplify compress, remove np version
Browse files Browse the repository at this point in the history
  • Loading branch information
wasade committed May 30, 2024
1 parent 890ae8e commit 641db04
Showing 1 changed file with 5 additions and 52 deletions.
57 changes: 5 additions & 52 deletions micov/_cov.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,10 @@ def _compress(rows):
start_val = None
end_val = None

# Potential expansion:
# allocate new ranges as 2D np zeros
# resize
# bulk
# push setting of genome into df

for start, stop in rows:
if end_val is None:
# case 1: no active range, start active range.
start_val = start
end_val = stop
elif end_val >= start:
# case 1: no active range, start active range.
start_val, end_val = rows[0]
for start, stop in rows[1:]:
if end_val >= start:
# case 2: active range continues through this range
# extend active range
end_val = max(end_val, stop)
Expand All @@ -65,43 +57,6 @@ def _compress(rows):
return new_ranges


@numba.jit(nopython=True)
def _compress_np(rows, new_ranges):
# derived from zebra
# https://github.com/biocore/zebra_filter/blob/master/cover.py#L14

start_val = None
end_val = None

idx = 0
row = new_ranges[idx]

start_val, end_val = rows[0]
for start, stop in rows[1:]:
if end_val >= start:
# case 2: active range continues through this range
# extend active range
end_val = max(end_val, stop)
else: # if end_val < r[0] - 1:
# case 3: active range ends before this range begins
# write new range out, then start new active range
row[0] = start_val
row[1] = end_val

idx += 1
row = new_ranges[idx]

start_val = start
end_val = stop

if end_val is not None:
row[0] = start_val
row[1] = end_val
idx += 1

return new_ranges[:idx]


def compress(df):
compressed = []
for (genome, ), grp in df.group_by([COLUMN_GENOME_ID, ]):
Expand All @@ -112,9 +67,7 @@ def compress(df):
.collect()
.to_numpy())

new_ranges = np.empty((len(rows), 2), dtype=int)

grp_compressed = _compress_np(rows, new_ranges)
grp_compressed = _compress(rows)
grp_compressed_df = pl.LazyFrame(grp_compressed,
schema=[BED_COV_SCHEMA.dtypes_flat[1],
BED_COV_SCHEMA.dtypes_flat[2]],
Expand Down

0 comments on commit 641db04

Please sign in to comment.