Skip to content

Commit

Permalink
Merge pull request #155 from sfiligoi/multi_def_230621
Browse files Browse the repository at this point in the history
Switch h5unifrac_all to lazy load semantics
  • Loading branch information
sfiligoi authored Jun 26, 2023
2 parents 6a9cca8 + 7d5f034 commit bd3cfac
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 19 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ jobs:
python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac('ci/test2.dm.h5'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1"
python -c "import unifrac; st_l=unifrac.h5permanova_dict('ci/test2.dm.h5'); assert len(st_l) == 1"
python -c "import unifrac; pc=unifrac.h5pcoa('ci/test3.dm.h5'); print(pc); assert len(pc.eigvals) == 2"
ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre --pcoa 3 --mode multi --subsample-depth 2 --n-subsamples 10 -r hdf5 -o ci/test4.dm.h5 -m unweighted
python -c "import unifrac; dm_u=unifrac.h5unifrac_all('ci/test4.dm.h5'); assert len(dm_u) == 10; print(dm_u[0]); print(dm_u[4]); print(dm_u[9]); dm_u.close(); assert len(dm_u) == 0"
if [[ "$(uname -s)" == "Linux" ]];
then
MD5=md5sum
Expand Down
73 changes: 54 additions & 19 deletions unifrac/_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from functools import reduce
from operator import or_
from typing import Union
import collections.abc

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -2552,7 +2553,56 @@ def h5unifrac(h5file: str) -> skbio.DistanceMatrix:
return dm


def h5unifrac_all(h5file: str) -> skbio.DistanceMatrix:
class H5UnifracTuple(collections.abc.Sequence):
"""Read all UniFrac distance matrices from a hdf5 file"""

def __init__(self, h5file: str):
self.f_u = h5py.File(h5file, "r")
self.order = [c.decode('ascii') for c in self.f_u['order'][:]]
# cache some often used values
self.nels = None
self.cached_idx = None
self.cached_el = None

def __getitem__(self, i: int) -> skbio.DistanceMatrix:
if i == self.cached_idx:
return self.cached_el
i_str = 'matrix:%i' % i
if i == 0:
if 'matrix' in self.f_u.keys():
# single format
i_str = 'matrix'
el = skbio.DistanceMatrix(self.f_u[i_str][:, :],
self.order)
# if it did not throw, cache
self.cached_idx = i
self.cached_el = el
return self.cached_el

def __len__(self) -> int:
if self.nels is None:
i = 0
if 'matrix' in self.f_u.keys():
# single format
i = 1
else:
# multi format
while 'matrix:%i' % i in self.f_u.keys():
i = i + 1
self.nels = i
return self.nels

def close(self):
"""Explicitly close the underlying file descriptor"""
self.f_u.close()
# invalidate all other cache values
self.order = None
self.nels = 0
self.cached_idx = None
self.cached_el = None


def h5unifrac_all(h5file: str) -> H5UnifracTuple:
"""Read all UniFrac distance matrices from a hdf5 file
Parameters
Expand All @@ -2562,8 +2612,8 @@ def h5unifrac_all(h5file: str) -> skbio.DistanceMatrix:
Returns
-------
tuple(skbio.DistanceMatrix)
The distance matrices.
H5UnifracTuple
A collection of distance matrices.
Raises
------
Expand All @@ -2582,22 +2632,7 @@ def h5unifrac_all(h5file: str) -> skbio.DistanceMatrix:
phylogeny. BMC Bioinformatics 12:118 (2011).
"""

with h5py.File(h5file, "r") as f_u:
order = [c.decode('ascii') for c in f_u['order'][:]]
if 'matrix' in f_u.keys():
# single format
dms = [skbio.DistanceMatrix(
f_u['matrix'][:, :], order)]
else:
# multi format
dms = []
i = 0
while 'matrix:%i' % i in f_u.keys():
dms.append(skbio.DistanceMatrix(
f_u['matrix:%i' % i][:, :], order))
i = i + 1

return dms
return H5UnifracTuple(h5file)


def _build_pcoa(f_u, long_method_name, order_index,
Expand Down

0 comments on commit bd3cfac

Please sign in to comment.