Batch integration: partial reading (#460)

* WIP: use partial read function for asw_batch * fix asw_batch with partial reading * update dependencies and add helper script * adapt metrics scripts to use partial reading * use latest base images * update base images for methods * use partial reading function for all integration methods * apply partial reading for control methods * use partial reading for transformers Former-commit-id: aab07af
openproblems-bio · Aug 27, 2024 · 6e3105b · 6e3105b
1 parent 36cff3b
commit 6e3105b
Show file tree

Hide file tree

Showing 82 changed files with 644 additions and 339 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
+README.html
+README_files/
 *.DS_Store
 *__pycache__
 *.h5ad

diff --git a/src/common/helper_functions/read_anndata_partial.py b/src/common/helper_functions/read_anndata_partial.py
@@ -0,0 +1,77 @@
+import warnings
+from pathlib import Path
+import anndata as ad
+import h5py
+from scipy.sparse import csr_matrix
+from anndata.experimental import read_elem, sparse_dataset
+
+
+def read_anndata(
+    file: str,
+    backed: bool = False,
+    **kwargs
+) -> ad.AnnData:
+    """
+    Read (parts of) an anndata file
+
+    :param file: path to anndata file in h5ad format
+    :param backed: passed on to `read_partial`; if True, the HDF5 file handle is
+        left open after a successful read instead of being closed here
+    :param kwargs: AnnData parameter to group mapping, e.g. `X='layers/normalized'`;
+        if empty, every top-level key of the file is read into the AnnData
+        parameter of the same name
+    :return: AnnData object built from the requested slots, or an empty AnnData
+        if the file has no top-level keys
+    """
+    assert Path(file).exists(), f'File not found: {file}'
+
+    f = h5py.File(file, 'r')
+    # The handle may only outlive this call after a successful backed read;
+    # on the empty-file return and on any error it is closed in `finally`.
+    keep_open = False
+    try:
+        if len(f.keys()) == 0:
+            return ad.AnnData()
+        if not kwargs:
+            kwargs = {x: x for x in f}
+        # check if keys are available
+        for name, slot in kwargs.items():
+            if slot not in f:
+                warnings.warn(
+                    f'Cannot find "{slot}" for AnnData parameter `{name}` from "{file}"'
+                )
+        adata = read_partial(f, backed=backed, **kwargs)
+        keep_open = backed
+        return adata
+    finally:
+        if not keep_open:
+            f.close()
+
+
+def read_partial(
+    group: h5py.Group,
+    backed: bool = False,
+    force_sparse_types: [str, list] = None,
+    **kwargs
+) -> ad.AnnData:
+    """
+    Build an AnnData object from selected elements of an h5py group
+
+    :params group: file group to read from
+    :params force_sparse_types: encoding type (or list of encoding types) whose
+        elements are converted with csr_matrix after reading
+    :params backed: if True, csr/csc encoded elements are wrapped with
+        sparse_dataset instead of being read via read_elem
+    :params **kwargs: dict of slot_name: slot, where slot is the key inside
+        `group` and slot_name the AnnData parameter it is stored under
+    :return: AnnData object; slots missing from `group` are passed as None
+
+    NOTE(review): with `backed=True` and a forced-sparse encoding, an in-memory
+    csr_matrix is handed to sparse_dataset - confirm this is supported.
+    """
+    # normalise `force_sparse_types` to a list
+    if isinstance(force_sparse_types, str):
+        force_sparse_types = [force_sparse_types]
+    elif force_sparse_types is None:
+        force_sparse_types = []
+
+    if backed:
+        print('Read as backed sparse matrix...')
+
+    loaded = {}
+    for target, source in kwargs.items():
+        print(f'Read slot "{source}", store as "{target}"...')
+        if source not in group:
+            warnings.warn(f'Slot "{source}" not found, skip...')
+            loaded[target] = None
+            continue
+
+        node = group[source]
+        encoding = ad._io.specs.get_spec(node).encoding_type
+        if backed and encoding in ("csr_matrix", "csc_matrix"):
+            value = sparse_dataset(node)
+        elif encoding in force_sparse_types:
+            value = csr_matrix(read_elem(node))
+            if backed:
+                value = sparse_dataset(value)
+        else:
+            value = read_elem(node)
+        loaded[target] = value
+
+    return ad.AnnData(**loaded)
+
diff --git a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml
@@ -14,9 +14,11 @@ functionality:
   resources:
     - type: python_script
       path: script.py
+    - type: python_script
+      path: /src/common/helper_functions/read_anndata_partial.py
 platforms:
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: ghcr.io/openproblems-bio/base_images/python:1.1.0
   - type: nextflow
     directives:
       label: [midtime, lowmem, lowcpu]
diff --git a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py
@@ -1,3 +1,4 @@
+import sys
 import scanpy as sc
 import numpy as np
 
@@ -15,9 +16,18 @@
 
 ## VIASH END
 
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+
 print('Read input', flush=True)
-adata = sc.read_h5ad(par['input'])
-adata.X = adata.layers["normalized"]
+adata = read_anndata(
+    par['input'],
+    X='layers/normalized',
+    obs='obs',
+    var='var',
+    uns='uns'
+)
 adata.var["highly_variable"] = adata.var["hvg"]
 
 print("Process dataset", flush=True)
@@ -27,7 +37,7 @@
     n_comps = min(50, np.sum(batch_idx))
     solver = "full" if n_comps == np.sum(batch_idx) else "arpack"
     adata.obsm["X_emb"][batch_idx, :n_comps] = sc.tl.pca(
-        adata[batch_idx],
+        adata[batch_idx].copy(),
         n_comps=n_comps,
         use_highly_variable=True,
         svd_solver=solver,

diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml
@@ -14,9 +14,11 @@ functionality:
   resources:
     - type: python_script
       path: script.py
+    - type: python_script
+      path: /src/common/helper_functions/read_anndata_partial.py
 platforms:
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: ghcr.io/openproblems-bio/base_images/python:1.1.0
   - type: nextflow
     directives:
       label: [ "midtime", "lowmem", "lowcpu"]
diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py
@@ -1,3 +1,4 @@
+import sys
 import scanpy as sc
 
 ## VIASH START
@@ -15,8 +16,17 @@
 
 ## VIASH END
 
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+
 print('Read input', flush=True)
-adata = sc.read_h5ad(par['input'])
+adata = read_anndata(
+    par['input'],
+    obs='obs',
+    obsm='obsm',
+    uns='uns'
+)
 
 print("process dataset", flush=True)
 adata.obsm["X_emb"] = adata.obsm["X_pca"]

diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml
@@ -14,9 +14,11 @@ functionality:
   resources:
     - type: python_script
       path: script.py
+    - type: python_script
+      path: /src/common/helper_functions/read_anndata_partial.py
 platforms:
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: ghcr.io/openproblems-bio/base_images/python:1.1.0
   - type: nextflow
     directives:
       label: [ "midtime", "lowmem", "lowcpu"]
diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py
@@ -1,3 +1,4 @@
+import sys
 import scanpy as sc
 
 ## VIASH START
@@ -15,12 +16,22 @@
 
 ## VIASH END
 
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+
 print('Read input', flush=True)
-adata = sc.read_h5ad(par['input'])
+adata = read_anndata(
+    par['input'],
+    X='layers/normalized',
+    obs='obs',
+    var='var',
+    uns='uns'
+)
 
 # no processing, subset matrix to highly variable genes
 adata_hvg = adata[:, adata.var["hvg"]].copy()
-adata.layers['corrected_counts'] = adata_hvg.layers["normalized"].copy()
+adata.layers['corrected_counts'] = adata_hvg.X.copy()
 
 print("Store outputs", flush=True)
 adata.uns['method_id'] = meta['functionality_name']

diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml
@@ -14,10 +14,12 @@ functionality:
   resources:
     - type: python_script
       path: script.py
+    - type: python_script
+      path: /src/common/helper_functions/read_anndata_partial.py
     - path: ../../utils.py
 platforms:
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: ghcr.io/openproblems-bio/base_images/python:1.1.0
   - type: nextflow
     directives:
       label: [ "midtime", "lowmem", "lowcpu"]
diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py
@@ -19,10 +19,16 @@
 # add helper scripts to path
 sys.path.append(meta["resources_dir"])
 from utils import _set_uns
+from read_anndata_partial import read_anndata
 
 
 print('Read input', flush=True)
-adata = sc.read_h5ad(par['input'])
+adata = read_anndata(
+    par['input'],
+    obs='obs',
+    obsp='obsp',
+    uns='uns'
+)
 
 print("process dataset", flush=True)
 neighbors_map = adata.uns['knn']

diff --git a/...asks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml b/...asks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml
@@ -14,10 +14,12 @@ functionality:
   resources:
     - type: python_script
       path: script.py
+    - type: python_script
+      path: /src/common/helper_functions/read_anndata_partial.py
     - path: ../../utils.py
 platforms:
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: ghcr.io/openproblems-bio/base_images/python:1.1.0
   - type: nextflow
     directives:
       label: [midtime, lowmem, lowcpu]
diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py
@@ -16,10 +16,15 @@
 ## VIASH END
 sys.path.append(meta["resources_dir"])
 from utils import _perfect_embedding
+from read_anndata_partial import read_anndata
 
 
 print('Read input', flush=True)
-adata = ad.read_h5ad(par['input'])
+adata = read_anndata(
+    par['input'],
+    obs='obs',
+    uns='uns'
+)
 
 print('Process data...', flush=True)
 adata.obsm["X_emb"] = _perfect_embedding(partition=adata.obs["label"])

diff --git a/...tch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml b/...tch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml
@@ -18,10 +18,12 @@ functionality:
   resources:
     - type: python_script
       path: script.py
+    - type: python_script
+      path: /src/common/helper_functions/read_anndata_partial.py
     - path: ../../utils.py
 platforms:
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: ghcr.io/openproblems-bio/base_images/python:1.1.0
   - type: nextflow
     directives:
       label: [midtime, lowmem, lowcpu]
diff --git a/...sks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py b/...sks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py
@@ -17,10 +17,15 @@
 ## VIASH END
 sys.path.append(meta["resources_dir"])
 from utils import _perfect_embedding
+from read_anndata_partial import read_anndata
 
 
 print('Read input', flush=True)
-adata = ad.read_h5ad(par['input'])
+adata = read_anndata(
+    par['input'],
+    obs='obs',
+    uns='uns'
+)
 
 print('Process data...', flush=True)
 adata.obsm["X_emb"] = _perfect_embedding(

diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml
@@ -14,10 +14,12 @@ functionality:
   resources:
     - type: python_script
       path: script.py
+    - type: python_script
+      path: /src/common/helper_functions/read_anndata_partial.py
     - path: ../../utils.py
 platforms:
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: ghcr.io/openproblems-bio/base_images/python:1.1.0
   - type: nextflow
     directives:
       label: [ "midtime", "lowmem", "lowcpu"]
diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py
@@ -19,9 +19,15 @@
 # add helper scripts to path
 sys.path.append(meta["resources_dir"])
 from utils import _randomize_features
+from read_anndata_partial import read_anndata
 
 print('Read input', flush=True)
-adata = sc.read_h5ad(par['input'])
+adata = read_anndata(
+    par['input'],
+    obs='obs',
+    obsm='obsm',
+    uns='uns'
+)
 
 print("process dataset", flush=True)
 adata.obsm["X_emb"] = _randomize_features(

diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml
@@ -14,10 +14,12 @@ functionality:
   resources:
     - type: python_script
       path: script.py
+    - type: python_script
+      path: /src/common/helper_functions/read_anndata_partial.py
     - path: ../../utils.py
 platforms:
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: ghcr.io/openproblems-bio/base_images/python:1.1.0
   - type: nextflow
     directives:
       label: [ "midtime", "lowmem", "lowcpu"]
diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py
@@ -19,12 +19,20 @@
 # add helper scripts to path
 sys.path.append(meta["resources_dir"])
 from utils import _randomize_features
+from read_anndata_partial import read_anndata
+
 
 print('Read input', flush=True)
-adata = ad.read_h5ad(par['input'])
+adata = read_anndata(
+    par['input'],
+    X='layers/normalized',
+    obs='obs',
+    var='var',
+    uns='uns'
+)
 
 adata.layers['corrected_counts'] = _randomize_features(
-    adata.layers["normalized"],
+    adata.X,
     partition=adata.obs["batch"],
 )
 

diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml
@@ -14,10 +14,12 @@ functionality:
   resources:
     - type: python_script
       path: script.py
+    - type: python_script
+      path: /src/common/helper_functions/read_anndata_partial.py
     - path: ../../utils.py
 platforms:
   - type: docker
-    image: ghcr.io/openproblems-bio/base_python:1.0.4
+    image: ghcr.io/openproblems-bio/base_images/python:1.1.0
   - type: nextflow
     directives:
       label: [ "midtime", "lowmem", "lowcpu"]