Skip to content

Commit

Permalink
Extract popv model/reference data download
Browse files Browse the repository at this point in the history
  • Loading branch information
axdanbol committed Oct 20, 2023
1 parent f132af1 commit 36ddf03
Show file tree
Hide file tree
Showing 10 changed files with 284 additions and 190 deletions.
4 changes: 0 additions & 4 deletions containers/popv/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,6 @@ FROM tensorflow/tensorflow:2.11.1

RUN apt-get -y update && apt-get -y install python3-dev g++ git wget

ENV MODELS_DIR=/models ZENODO_MODELS_ID=7580707
COPY context/download-models.sh .
RUN ./download-models.sh

COPY context/download-ontology.sh .
RUN ./download-ontology.sh

Expand Down
8 changes: 8 additions & 0 deletions containers/popv/context/download-data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Download both the models and the reference data into one output directory.
#
# Usage: download-data.sh <models-id> <reference-data-id> [output-dir]
#   $1  zenodo models id, forwarded to /download-models.sh (required)
#   $2  zenodo reference data id, forwarded to /download-reference-data.sh (required)
#   $3  output directory; defaults to ./popv
#
# Models are written to <output-dir>/models and reference data to
# <output-dir>/reference-data.
set -e

# Validate both ids before doing any work, so that a missing reference data
# id is reported immediately instead of after the models download has run.
MODELS_ID=${1:?"A zenodo models id must be provided to download!"}
REFERENCE_DATA_ID=${2:?"A zenodo reference data id must be provided to download!"}
OUTPUT_DIR=${3:-"./popv"}

mkdir -p "$OUTPUT_DIR"
/download-models.sh "$MODELS_ID" "$OUTPUT_DIR/models"
/download-reference-data.sh "$REFERENCE_DATA_ID" "$OUTPUT_DIR/reference-data"
23 changes: 12 additions & 11 deletions containers/popv/context/download-models.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
#!/bin/bash
set -e

# Defined by user: MODELS_DIR and ZENODO_MODELS_ID
MODELS_TMP_DIR=$MODELS_DIR/tmp
MODELS_ID=${1:?"A zenodo models id must be provided to download!"}
MODELS_DIR=${2:-"./popv/models"}

mkdir -p $MODELS_DIR $MODELS_TMP_DIR
mkdir -p $MODELS_DIR
zenodo_get $MODELS_ID -o $MODELS_DIR

pip install 'zenodo_get@git+https://github.com/dvolgyes/zenodo_get'
zenodo_get $ZENODO_MODELS_ID -o $MODELS_TMP_DIR
for ARCHIVE in $MODELS_DIR/*.tar.gz; do
MODEL=$(basename -s .tar.gz $ARCHIVE)
DIR="$MODELS_DIR/$MODEL"

for archive in $MODELS_TMP_DIR/*.tar.gz; do
NAME=`basename $archive .tar.gz`
mkdir -p $MODELS_DIR/${NAME}
tar zx -C $MODELS_DIR/${NAME} -f $archive
if [[ ! -d $DIR || -z $(ls -A $DIR) ]]; then
mkdir -p $DIR
tar zx -C $DIR -f $ARCHIVE
fi
done

rm -rf $MODELS_TMP_DIR
8 changes: 8 additions & 0 deletions containers/popv/context/download-reference-data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Download the reference data for a zenodo id into a directory.
#
# Usage: download-reference-data.sh <reference-data-id> [output-dir]
#   $1  zenodo reference data id handed to zenodo_get (required)
#   $2  output directory; defaults to ./popv/reference-data
set -e

REFERENCE_DATA_ID=${1:?"A zenodo reference data id must be provided to download!"}
REFERENCE_DATA_DIR=${2:-"./popv/reference-data"}

# The expansions are quoted so that an output path containing spaces or glob
# characters reaches mkdir and zenodo_get as a single, unmodified argument.
mkdir -p "$REFERENCE_DATA_DIR"
zenodo_get "$REFERENCE_DATA_ID" -o "$REFERENCE_DATA_DIR"
99 changes: 66 additions & 33 deletions containers/popv/context/main.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import os
import typing as t
from logging import warn
from pathlib import Path

import numpy
import popv
import scanpy

import popv
from src.algorithm import Algorithm, OrganLookup, add_common_arguments


class PopvOptions(t.TypedDict):
reference_data: scanpy.AnnData
reference_data_dir: Path
models_dir: Path
prediction_mode: str
cell_ontology_dir: str
query_labels_key: t.Optional[str]
Expand All @@ -22,30 +23,10 @@ class PopvOptions(t.TypedDict):
samples_per_label: int


class PopvOrganLookup(OrganLookup[str]):
    """OrganLookup whose built-in options are the model directories on disk.

    The directories are searched for under the path given by the
    ``MODELS_DIR`` environment variable and must be named
    ``pretrained_models_<name>_ts``.
    """

    def __init__(self, mapping_file: Path):
        super().__init__(mapping_file)
        # Raises KeyError when MODELS_DIR is not set in the environment.
        self.models_dir = Path(os.environ["MODELS_DIR"])
        self.prefix = "pretrained_models_"
        self.suffix = "_ts"

    def get_builtin_options(self):
        """Return a lazy iterable of ``(name, directory)`` pairs."""
        return (
            (self.get_name(model_dir), model_dir) for model_dir in self.get_dirs()
        )

    def get_dirs(self) -> t.Iterable[Path]:
        """Return the entries of the models directory matching prefix and suffix."""
        pattern = f"{self.prefix}*{self.suffix}"
        return self.models_dir.glob(pattern)

    def get_name(self, dir: Path) -> str:
        """Strip the prefix and the suffix from the directory's name."""
        prefix_len = len(self.prefix)
        suffix_len = len(self.suffix)
        return dir.name[prefix_len:-suffix_len]


class PopvAlgorithm(Algorithm[str, PopvOptions]):
def __init__(self):
super().__init__(PopvOrganLookup)
self.models_dir = Path(os.environ["MODELS_DIR"])
super().__init__(OrganLookup)
# self.models_dir = Path(os.environ["MODELS_DIR"])

def do_run(self, matrix: Path, organ: str, options: PopvOptions):
data = scanpy.read_h5ad(matrix)
Expand All @@ -63,18 +44,25 @@ def do_run(self, matrix: Path, organ: str, options: PopvOptions):
def prepare_query(
self, data: scanpy.AnnData, organ: str, options: PopvOptions
) -> scanpy.AnnData:
reference_data_path = self.find_reference_data(
options["reference_data_dir"], organ
)
model_path = self.find_model_dir(options["models_dir"], organ)
reference_data = scanpy.read_h5ad(reference_data_path)
n_samples_per_label = self.get_n_samples_per_label(reference_data, options)

query = popv.preprocessing.Process_Query(
data,
options["reference_data"],
save_path_trained_models=str(self.models_dir / organ),
reference_data,
save_path_trained_models=str(model_path),
prediction_mode=options["prediction_mode"],
query_labels_key=options["query_labels_key"],
query_batch_key=options["query_batch_key"],
query_layers_key=options["query_layers_key"],
ref_labels_key=options["ref_labels_key"],
ref_batch_key=options["ref_batch_key"],
unknown_celltype_label=options["unknown_labels_key"],
n_samples_per_label=self.get_n_samples_per_label(options),
n_samples_per_label=n_samples_per_label,
cl_obo_folder=f"{options['cell_ontology_dir']}/",
compute_embedding=True,
hvg=None,
Expand All @@ -83,23 +71,68 @@ def prepare_query(

return query.adata

def get_n_samples_per_label(self, options: PopvOptions) -> int:
reference_data = options["reference_data"]
def get_n_samples_per_label(
self, reference_data: scanpy.AnnData, options: PopvOptions
) -> int:
ref_labels_key = options["ref_labels_key"]
n_samples_per_label = options["samples_per_label"]
if ref_labels_key in reference_data.obs.columns:
n = numpy.min(reference_data.obs.groupby(ref_labels_key).size())
n_samples_per_label = numpy.max((n_samples_per_label, t.cast(int, n)))
return n_samples_per_label

def find_reference_data(self, dir: Path, organ: str) -> Path:
    """Locate the ``.h5ad`` reference data file for ``organ`` inside ``dir``.

    An entry of ``dir`` qualifies when it is a file, its suffix is exactly
    ``.h5ad`` and its stem contains ``organ`` (compared case-insensitively).
    Choosing among the matches, and raising when there are none, is
    delegated to ``_find_in_dir``.
    """
    organ_key = organ.lower()

    def matches(entry: Path):
        if not entry.is_file():
            return False
        if entry.suffix != ".h5ad":
            return False
        return organ_key in entry.stem.lower()

    missing_msg = f"Cannot find reference data for organ '{organ}'"
    ambiguous_msg = f"Multiple reference data candidates for organ '{organ}'"
    return self._find_in_dir(dir, matches, missing_msg, ambiguous_msg)

def find_model_dir(self, dir: Path, organ: str) -> Path:
    """Locate the model directory for ``organ`` inside ``dir``.

    An entry of ``dir`` qualifies when it is a directory whose name contains
    ``organ`` (compared case-insensitively). Choosing among the matches, and
    raising when there are none, is delegated to ``_find_in_dir``.
    """
    organ_key = organ.lower()

    def matches(entry: Path):
        if not entry.is_dir():
            return False
        return organ_key in entry.name.lower()

    missing_msg = f"Cannot find model directory for organ '{organ}'"
    ambiguous_msg = f"Multiple model directory candidates for organ '{organ}'"
    return self._find_in_dir(dir, matches, missing_msg, ambiguous_msg)

def _find_in_dir(
    self, dir: Path, cond: t.Callable[[Path], bool], error_msg: str, warn_msg: str
) -> Path:
    """Return the entry of ``dir`` that satisfies ``cond`` and has the shortest name.

    Args:
        dir: Directory whose immediate entries are examined.
        cond: Predicate deciding whether an entry is a candidate.
        error_msg: Message of the ``ValueError`` raised when nothing matches.
        warn_msg: Message logged as a warning when several entries match.

    Returns:
        The matching entry with the shortest name; entries whose names have
        the same length are ordered by name so the result is reproducible.

    Raises:
        ValueError: If no entry of ``dir`` satisfies ``cond``.
    """
    # Imported locally so the non-deprecated ``logging.warning`` can be
    # called; ``logging.warn`` is only a deprecated alias for it.
    import logging

    candidates = sorted(
        (entry for entry in dir.iterdir() if cond(entry)),
        # ``iterdir`` yields entries in arbitrary order, so the name itself
        # breaks ties between candidates of equal length.
        key=lambda entry: (len(entry.name), entry.name),
    )

    if not candidates:
        raise ValueError(error_msg)
    if len(candidates) > 1:
        logging.warning(warn_msg)
    return candidates[0]


def _get_arg_parser():
parser = add_common_arguments()
parser.add_argument(
"--reference-data",
type=scanpy.read_h5ad,
"--reference-data-dir",
type=Path,
required=True,
help="Path to directory with reference data",
)
parser.add_argument(
"--models-dir",
type=Path,
required=True,
help="h5ad reference data file",
help="Path to models directory",
)
parser.add_argument("--prediction-mode", default="fast", help="Prediction mode")
parser.add_argument(
Expand Down
48 changes: 24 additions & 24 deletions containers/popv/context/organ-mapping.json
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
{
"UBERON:0001255": "pretrained_models_Bladder_ts",
"UBERON:0000178": "pretrained_models_Blood_ts",
"UBERON:0002371": "pretrained_models_Bone_Marrow_ts",
"UBERON:0000970": "pretrained_models_Eye_ts",
"UBERON:0004548": "pretrained_models_Eye_ts",
"UBERON:0004549": "pretrained_models_Eye_ts",
"UBERON:0000948": "pretrained_models_Heart_ts",
"UBERON:0000059": "pretrained_models_Large_Intestine_ts",
"UBERON:0002107": "pretrained_models_Liver_ts",
"UBERON:0002048": "pretrained_models_Lung_ts",
"UBERON:0001004": "pretrained_models_Lung_ts",
"UBERON:0000029": "pretrained_models_Lymph_Node_ts",
"UBERON:0002509": "pretrained_models_Lymph_Node_ts",
"UBERON:0001911": "pretrained_models_Mammary_ts",
"UBERON:0001264": "pretrained_models_Pancreas_ts",
"UBERON:0002367": "pretrained_models_Prostate_ts",
"UBERON:0000079": "pretrained_models_Prostate_ts",
"UBERON:0002097": "pretrained_models_Skin_ts",
"UBERON:0002108": "pretrained_models_Small_Intestine_ts",
"UBERON:0002106": "pretrained_models_Spleen_ts",
"UBERON:0002370": "pretrained_models_Thymus_ts",
"UBERON:0003126": "pretrained_models_Trachea_ts",
"UBERON:0000995": "pretrained_models_Uterus_ts",
"UBERON:0004537": "pretrained_models_Vasculature_ts"
"UBERON:0001255": "Bladder",
"UBERON:0000178": "Blood",
"UBERON:0002371": "Bone_Marrow",
"UBERON:0000970": "Eye",
"UBERON:0004548": "Eye",
"UBERON:0004549": "Eye",
"UBERON:0000948": "Heart",
"UBERON:0000059": "Large_Intestine",
"UBERON:0002107": "Liver",
"UBERON:0002048": "Lung",
"UBERON:0001004": "Lung",
"UBERON:0000029": "Lymph_Node",
"UBERON:0002509": "Lymph_Node",
"UBERON:0001911": "Mammary",
"UBERON:0001264": "Pancreas",
"UBERON:0002367": "Prostate",
"UBERON:0000079": "Prostate",
"UBERON:0002097": "Skin",
"UBERON:0002108": "Small_Intestine",
"UBERON:0002106": "Spleen",
"UBERON:0002370": "Thymus",
"UBERON:0003126": "Trachea",
"UBERON:0000995": "Uterus",
"UBERON:0004537": "Vasculature"
}
Loading

0 comments on commit 36ddf03

Please sign in to comment.