Skip to content

Commit

Permalink
Batch integration: partial reading (#460)
Browse files Browse the repository at this point in the history
* WIP: use partial read function for asw_batch

* fix asw_batch with partial reading

* update dependencies and add helper script

* adapt metrics scripts to use partial reading

* use latest base images

* update base images for methods

* use partial reading function for all integration methods

* apply partial reading for control methods

* use partial reading for transformers

Former-commit-id: aab07af
  • Loading branch information
mumichae authored Aug 27, 2024
1 parent 36cff3b commit 6e3105b
Show file tree
Hide file tree
Showing 82 changed files with 644 additions and 339 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
README.html
README_files/
*.DS_Store
*__pycache__
*.h5ad
Expand Down
77 changes: 77 additions & 0 deletions src/common/helper_functions/read_anndata_partial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import warnings
from pathlib import Path
import anndata as ad
import h5py
from scipy.sparse import csr_matrix
from anndata.experimental import read_elem, sparse_dataset


def read_anndata(
    file: str,
    backed: bool = False,
    **kwargs
) -> ad.AnnData:
    """
    Read anndata file, optionally restricted to a subset of its groups

    :param file: path to anndata file in h5ad format
    :param backed: forwarded to ``read_partial``; when True the HDF5 file is
        left open after a successful read (presumably because backed sparse
        datasets still reference it -- confirm against callers)
    :param kwargs: AnnData parameter to group mapping, e.g.
        ``X='layers/normalized'``; when empty, every top-level key of the file
        is read into the AnnData parameter of the same name
    :return: AnnData object; an empty AnnData when the file has no keys
    """
    assert Path(file).exists(), f'File not found: {file}'

    f = h5py.File(file, 'r')
    # Only a successful backed read may keep the handle open; every other
    # exit path (empty file, exception, in-memory read) must close it.
    keep_open = False
    try:
        if not kwargs:
            kwargs = {x: x for x in f}
        if len(f.keys()) == 0:
            return ad.AnnData()
        # check if keys are available
        for name, slot in kwargs.items():
            if slot not in f:
                warnings.warn(
                    f'Cannot find "{slot}" for AnnData parameter `{name}` from "{file}"'
                )
        adata = read_partial(f, backed=backed, **kwargs)
        keep_open = backed
        return adata
    finally:
        if not keep_open:
            f.close()


def read_partial(
    group: h5py.Group,
    backed: bool = False,
    force_sparse_types: [str, list] = None,
    **kwargs
) -> ad.AnnData:
    """
    Build an AnnData object from selected members of an h5py group

    :params group: file group to read the slots from
    :params force_sparse_types: encoding type (or list of encoding types) whose
        elements are read and converted with csr_matrix
    :params backed: wrap csr/csc encoded elements with sparse_dataset instead
        of reading them
    :params **kwargs: dict of slot_name: slot, where slot is the key looked up
        in ``group`` and slot_name the AnnData parameter it is passed as
    :return: AnnData object
    """
    # Normalise the option so the membership test below always sees a list.
    if isinstance(force_sparse_types, str):
        force_sparse_types = [force_sparse_types]
    elif force_sparse_types is None:
        force_sparse_types = []

    loaded = {}
    if backed:
        print('Read as backed sparse matrix...')

    for slot_name, slot in kwargs.items():
        print(f'Read slot "{slot}", store as "{slot_name}"...')

        # Missing slots are passed to AnnData as None rather than dropped.
        if slot not in group:
            warnings.warn(f'Slot "{slot}" not found, skip...')
            loaded[slot_name] = None
            continue

        node = group[slot]
        encoding = ad._io.specs.get_spec(node).encoding_type
        stored_sparse = encoding in ("csr_matrix", "csc_matrix")

        if stored_sparse and backed:
            value = sparse_dataset(node)
        elif encoding in force_sparse_types:
            value = csr_matrix(read_elem(node))
            if backed:
                value = sparse_dataset(value)
        else:
            value = read_elem(node)
        loaded[slot_name] = value

    return ad.AnnData(**loaded)

Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@ functionality:
resources:
- type: python_script
path: script.py
- type: python_script
path: /src/common/helper_functions/read_anndata_partial.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
image: ghcr.io/openproblems-bio/base_images/python:1.1.0
- type: nextflow
directives:
label: [midtime, lowmem, lowcpu]
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import sys
import scanpy as sc
import numpy as np

Expand All @@ -15,9 +16,18 @@

## VIASH END

sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata


print('Read input', flush=True)
adata = sc.read_h5ad(par['input'])
adata.X = adata.layers["normalized"]
adata = read_anndata(
par['input'],
X='layers/normalized',
obs='obs',
var='var',
uns='uns'
)
adata.var["highly_variable"] = adata.var["hvg"]

print("Process dataset", flush=True)
Expand All @@ -27,7 +37,7 @@
n_comps = min(50, np.sum(batch_idx))
solver = "full" if n_comps == np.sum(batch_idx) else "arpack"
adata.obsm["X_emb"][batch_idx, :n_comps] = sc.tl.pca(
adata[batch_idx],
adata[batch_idx].copy(),
n_comps=n_comps,
use_highly_variable=True,
svd_solver=solver,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@ functionality:
resources:
- type: python_script
path: script.py
- type: python_script
path: /src/common/helper_functions/read_anndata_partial.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
image: ghcr.io/openproblems-bio/base_images/python:1.1.0
- type: nextflow
directives:
label: [ "midtime", "lowmem", "lowcpu"]
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import sys
import scanpy as sc

## VIASH START
Expand All @@ -15,8 +16,17 @@

## VIASH END

sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata


print('Read input', flush=True)
adata = sc.read_h5ad(par['input'])
adata = read_anndata(
par['input'],
obs='obs',
obsm='obsm',
uns='uns'
)

print("process dataset", flush=True)
adata.obsm["X_emb"] = adata.obsm["X_pca"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@ functionality:
resources:
- type: python_script
path: script.py
- type: python_script
path: /src/common/helper_functions/read_anndata_partial.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
image: ghcr.io/openproblems-bio/base_images/python:1.1.0
- type: nextflow
directives:
label: [ "midtime", "lowmem", "lowcpu"]
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import sys
import scanpy as sc

## VIASH START
Expand All @@ -15,12 +16,22 @@

## VIASH END

sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata


print('Read input', flush=True)
adata = sc.read_h5ad(par['input'])
adata = read_anndata(
par['input'],
X='layers/normalized',
obs='obs',
var='var',
uns='uns'
)

# no processing, subset matrix to highly variable genes
adata_hvg = adata[:, adata.var["hvg"]].copy()
adata.layers['corrected_counts'] = adata_hvg.layers["normalized"].copy()
adata.layers['corrected_counts'] = adata_hvg.X.copy()

print("Store outputs", flush=True)
adata.uns['method_id'] = meta['functionality_name']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ functionality:
resources:
- type: python_script
path: script.py
- type: python_script
path: /src/common/helper_functions/read_anndata_partial.py
- path: ../../utils.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
image: ghcr.io/openproblems-bio/base_images/python:1.1.0
- type: nextflow
directives:
label: [ "midtime", "lowmem", "lowcpu"]
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,16 @@
# add helper scripts to path
sys.path.append(meta["resources_dir"])
from utils import _set_uns
from read_anndata_partial import read_anndata


print('Read input', flush=True)
adata = sc.read_h5ad(par['input'])
adata = read_anndata(
par['input'],
obs='obs',
obsp='obsp',
uns='uns'
)

print("process dataset", flush=True)
neighbors_map = adata.uns['knn']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ functionality:
resources:
- type: python_script
path: script.py
- type: python_script
path: /src/common/helper_functions/read_anndata_partial.py
- path: ../../utils.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
image: ghcr.io/openproblems-bio/base_images/python:1.1.0
- type: nextflow
directives:
label: [midtime, lowmem, lowcpu]
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,15 @@
## VIASH END
sys.path.append(meta["resources_dir"])
from utils import _perfect_embedding
from read_anndata_partial import read_anndata


print('Read input', flush=True)
adata = ad.read_h5ad(par['input'])
adata = read_anndata(
par['input'],
obs='obs',
uns='uns'
)

print('Process data...', flush=True)
adata.obsm["X_emb"] = _perfect_embedding(partition=adata.obs["label"])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,12 @@ functionality:
resources:
- type: python_script
path: script.py
- type: python_script
path: /src/common/helper_functions/read_anndata_partial.py
- path: ../../utils.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
image: ghcr.io/openproblems-bio/base_images/python:1.1.0
- type: nextflow
directives:
label: [midtime, lowmem, lowcpu]
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,15 @@
## VIASH END
sys.path.append(meta["resources_dir"])
from utils import _perfect_embedding
from read_anndata_partial import read_anndata


print('Read input', flush=True)
adata = ad.read_h5ad(par['input'])
adata = read_anndata(
par['input'],
obs='obs',
uns='uns'
)

print('Process data...', flush=True)
adata.obsm["X_emb"] = _perfect_embedding(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ functionality:
resources:
- type: python_script
path: script.py
- type: python_script
path: /src/common/helper_functions/read_anndata_partial.py
- path: ../../utils.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
image: ghcr.io/openproblems-bio/base_images/python:1.1.0
- type: nextflow
directives:
label: [ "midtime", "lowmem", "lowcpu"]
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,15 @@
# add helper scripts to path
sys.path.append(meta["resources_dir"])
from utils import _randomize_features
from read_anndata_partial import read_anndata

print('Read input', flush=True)
adata = sc.read_h5ad(par['input'])
adata = read_anndata(
par['input'],
obs='obs',
obsm='obsm',
uns='uns'
)

print("process dataset", flush=True)
adata.obsm["X_emb"] = _randomize_features(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ functionality:
resources:
- type: python_script
path: script.py
- type: python_script
path: /src/common/helper_functions/read_anndata_partial.py
- path: ../../utils.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
image: ghcr.io/openproblems-bio/base_images/python:1.1.0
- type: nextflow
directives:
label: [ "midtime", "lowmem", "lowcpu"]
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,20 @@
# add helper scripts to path
sys.path.append(meta["resources_dir"])
from utils import _randomize_features
from read_anndata_partial import read_anndata


print('Read input', flush=True)
adata = ad.read_h5ad(par['input'])
adata = read_anndata(
par['input'],
X='layers/normalized',
obs='obs',
var='var',
uns='uns'
)

adata.layers['corrected_counts'] = _randomize_features(
adata.layers["normalized"],
adata.X,
partition=adata.obs["batch"],
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ functionality:
resources:
- type: python_script
path: script.py
- type: python_script
path: /src/common/helper_functions/read_anndata_partial.py
- path: ../../utils.py
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
image: ghcr.io/openproblems-bio/base_images/python:1.1.0
- type: nextflow
directives:
label: [ "midtime", "lowmem", "lowcpu"]
Loading

0 comments on commit 6e3105b

Please sign in to comment.