From 0674e10e7188abe2447e8862101cd89abc3de48b Mon Sep 17 00:00:00 2001 From: Igor Sfiligoi Date: Mon, 26 Jun 2023 06:51:07 -0700 Subject: [PATCH 1/5] Switch to lazy object creation in h5unifrac_all --- unifrac/_methods.py | 55 +++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/unifrac/_methods.py b/unifrac/_methods.py index 9c1e616e..77f24d83 100644 --- a/unifrac/_methods.py +++ b/unifrac/_methods.py @@ -9,6 +9,7 @@ from functools import reduce from operator import or_ from typing import Union +import collections.abc import numpy as np import pandas as pd @@ -2552,7 +2553,38 @@ def h5unifrac(h5file: str) -> skbio.DistanceMatrix: return dm -def h5unifrac_all(h5file: str) -> skbio.DistanceMatrix: +class H5UnifracTuple(collections.abc.Sequence): + """Read all UniFrac distance matrices from a hdf5 file""" + + def __init__(self, h5file: str): + self.f_u = h5py.File(h5file, "r") + self.order = [c.decode('ascii') for c in self.f_u['order'][:]] + self.nels = None + + def __getitem__(self, i: int) -> skbio.DistanceMatrix: + i_str = 'matrix:%i' % i + if i == 0: + if 'matrix' in self.f_u.keys(): + # single format + i_str = 'matrix' + return skbio.DistanceMatrix(self.f_u[i_str][:, :], + self.order) + + def __len__(self) -> int: + if self.nels is None: + i = 0 + if 'matrix' in self.f_u.keys(): + # single format + i= 1 + else: + # multi format + while 'matrix:%i' % i in self.f_u.keys(): + i = i + 1 + self.nels = i + return self.nels + + +def h5unifrac_all(h5file: str) -> H5UnifracTuple: """Read all UniFrac distance matrices from a hdf5 file Parameters @@ -2562,8 +2594,8 @@ def h5unifrac_all(h5file: str) -> skbio.DistanceMatrix: Returns ------- - tuple(skbio.DistanceMatrix) - The distance matrices. + H5UnifracTuple + A collection of distance matrices. Raises ------ @@ -2582,22 +2614,7 @@ def h5unifrac_all(h5file: str) -> skbio.DistanceMatrix: phylogeny. BMC Bioinformatics 12:118 (2011). 
""" - with h5py.File(h5file, "r") as f_u: - order = [c.decode('ascii') for c in f_u['order'][:]] - if 'matrix' in f_u.keys(): - # single format - dms = [skbio.DistanceMatrix( - f_u['matrix'][:, :], order)] - else: - # multi format - dms = [] - i = 0 - while 'matrix:%i' % i in f_u.keys(): - dms.append(skbio.DistanceMatrix( - f_u['matrix:%i' % i][:, :], order)) - i = i + 1 - - return dms + return H5UnifracTuple(h5file) def _build_pcoa(f_u, long_method_name, order_index, From efcfcc39c6d078a1c141bad1d1010ec87f76520e Mon Sep 17 00:00:00 2001 From: Igor Sfiligoi Date: Mon, 26 Jun 2023 06:52:54 -0700 Subject: [PATCH 2/5] Fix spacing for flake --- unifrac/_methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unifrac/_methods.py b/unifrac/_methods.py index 77f24d83..4677edfa 100644 --- a/unifrac/_methods.py +++ b/unifrac/_methods.py @@ -2575,14 +2575,14 @@ def __len__(self) -> int: i = 0 if 'matrix' in self.f_u.keys(): # single format - i= 1 + i = 1 else: # multi format while 'matrix:%i' % i in self.f_u.keys(): i = i + 1 self.nels = i return self.nels - + def h5unifrac_all(h5file: str) -> H5UnifracTuple: """Read all UniFrac distance matrices from a hdf5 file From b2e92efd32687be1b790f40f9621e071d7276938 Mon Sep 17 00:00:00 2001 From: Igor Sfiligoi Date: Mon, 26 Jun 2023 07:19:42 -0700 Subject: [PATCH 3/5] Cache last accessed value to make it more use friendly --- unifrac/_methods.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/unifrac/_methods.py b/unifrac/_methods.py index 4677edfa..784a6769 100644 --- a/unifrac/_methods.py +++ b/unifrac/_methods.py @@ -2559,16 +2559,25 @@ class H5UnifracTuple(collections.abc.Sequence): def __init__(self, h5file: str): self.f_u = h5py.File(h5file, "r") self.order = [c.decode('ascii') for c in self.f_u['order'][:]] + # cache some often used values self.nels = None + self.cached_idx = None + self.cached_el = None def __getitem__(self, i: int) -> skbio.DistanceMatrix: + if i == 
self.cached_idx: + return self.cached_el i_str = 'matrix:%i' % i if i == 0: if 'matrix' in self.f_u.keys(): # single format i_str = 'matrix' - return skbio.DistanceMatrix(self.f_u[i_str][:, :], - self.order) + el = skbio.DistanceMatrix(self.f_u[i_str][:, :], + self.order) + # if it did not throw, cache + self.cached_idx = i + self.cached_el = el + return self.cached_el def __len__(self) -> int: if self.nels is None: From 4654fe6767139ba95651c7930abb87b48479fa67 Mon Sep 17 00:00:00 2001 From: Igor Sfiligoi Date: Mon, 26 Jun 2023 07:29:10 -0700 Subject: [PATCH 4/5] Add close method --- unifrac/_methods.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/unifrac/_methods.py b/unifrac/_methods.py index 784a6769..9f711195 100644 --- a/unifrac/_methods.py +++ b/unifrac/_methods.py @@ -2592,6 +2592,15 @@ def __len__(self) -> int: self.nels = i return self.nels + def close(self): + """Explicitly close the underlying file descriptor""" + self.f_u.close() + # invalidate all other cache values + self.order = None + self.nels = 0 + self.cached_idx = None + self.cached_el = None + def h5unifrac_all(h5file: str) -> H5UnifracTuple: """Read all UniFrac distance matrices from a hdf5 file From 7d5f03483c069d0f3735251652b480f1f97ece72 Mon Sep 17 00:00:00 2001 From: Igor Sfiligoi Date: Mon, 26 Jun 2023 09:15:15 -0700 Subject: [PATCH 5/5] Add proper h5unifrac_all tests --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4b5c7655..0e5569fa 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -121,6 +121,8 @@ jobs: python -c "import unifrac,skbio; dm = skbio.DistanceMatrix.read('ci/test.dm'); dm_u=unifrac.h5unifrac('ci/test2.dm.h5'); t=abs(dm_u.data-dm.data).max(); print(t); assert t < 0.1" python -c "import unifrac; st_l=unifrac.h5permanova_dict('ci/test2.dm.h5'); assert len(st_l) == 1" python -c "import unifrac; pc=unifrac.h5pcoa('ci/test3.dm.h5'); print(pc); 
assert len(pc.eigvals) == 2" + ssu -i unifrac/tests/data/crawford.biom -t unifrac/tests/data/crawford.tre --pcoa 3 --mode multi --subsample-depth 2 --n-subsamples 10 -r hdf5 -o ci/test4.dm.h5 -m unweighted + python -c "import unifrac; dm_u=unifrac.h5unifrac_all('ci/test4.dm.h5'); assert len(dm_u) == 10; print(dm_u[0]); print(dm_u[4]); print(dm_u[9]); dm_u.close(); assert len(dm_u) == 0" if [[ "$(uname -s)" == "Linux" ]]; then MD5=md5sum