Feature/chunked writer precomputed burdens testing combo (#135)
* make genotype and variant files optional

* specify burden file manually

* specify burden file manually

* add pipelines

* bug fixes

* add pipelines

* add eval pipeline

* Small bug fix removing overwriting of function option name.

* add argument for additional REGENIE options

* keep beta in results

* add conditional analysis

* run regenie_step2 in long queue

* improve modularity

* remove .loco files from output/input

* run REGENIE step 2 on verylong queue

* bug fixes

* bug fixes

* Add tests

* Add test data for merging

* Fix test path

* unused variable

* bug fix

* bug fix (remove broken debugging code)

* remove duplicate rule

* update pipelines for new burden directory structure

* bug fixes for REGENIE pipeline

* bug fixes for REGENIE pipelines

* add option for determinism in training

* add test for results of training_association_testing pipeline

* corrections to pipelines

* delete old example config

* bug fixes for new config file naming

* improve error message when repo is misspecified

* correct data key

* add command to compare association testing results

* cast trial ID to int to fix intermittent bug where it's stored as float

* expose deterministic option (for testing/debugging)

* remove deeprvat_repo_dir

* Add chunked writer

* Remove deeprvat from paths

* Fix dtype

* Add merge to snakemake pipeline

* Remove x.zarr and y.zarr

* Working test for 5 chunks

* Add back sample_ids test

* Fix pretrained path

* Remove phenotypes key

* Fix failing tests

* More config errors

* fixes to pipelines

* remove unfinished sections

* pipeline fixes

* reduce number of phenotypes in example configs

* bug fix

* pipeline fixes

* add example config for REGENIE

* pipeline fixes

* pipeline fixes

* pipeline fixes

* adapt CV pipeline

* pipeline fix

* reduce uninformative logging

* modifications for running on compute cluster

* pipeline fix

* fix typo

* delete unused files

* fixup! Format Python code with psf/black pull_request

* pipeline fixes

* code cleanup

* remove unneeded test files

* fixup! Format Python code with psf/black pull_request

* correct typo

* black

* fixup! Format Python code with psf/black pull_request

---------

Co-authored-by: Brian Clarke <[email protected]>
Co-authored-by: Thibault <[email protected]>
Co-authored-by: Magnus Wahlberg <[email protected]>
Co-authored-by: PMBio <[email protected]>
5 people authored Sep 24, 2024
1 parent 801fc32 commit f0a2e25
Showing 39 changed files with 2,068 additions and 622 deletions.
102 changes: 69 additions & 33 deletions deeprvat/cv_utils.py
@@ -96,80 +96,117 @@ def generate_test_config(input_config, out_file, fold, n_folds):


@cli.command()
@click.option("--link-burdens", type=click.Path())
@click.option("--skip-burdens", is_flag=True)
@click.option("--burden-dirs", "-b", multiple=True)
@click.argument("out_dir", type=click.Path(), default="./")
@click.option("--xy-dirs", "-b", multiple=True)
@click.argument("out_dir_burdens", type=click.Path(), default="./")
@click.argument("out_dir_xy", type=click.Path(), default="./")
@click.argument("config_file", type=click.Path(exists=True))
def combine_test_set_burdens(
out_dir,
link_burdens,
out_dir_burdens,
out_dir_xy,
skip_burdens,
burden_dirs,
xy_dirs,
config_file,
):
assert len(burden_dirs) == len(xy_dirs)

with open(config_file) as f:
config = yaml.safe_load(f)
compression_level = 1
skip_burdens = link_burdens is not None
n_total_samples = []
for burden_dir in burden_dirs:
print(burden_dir)
this_y = zarr.open(f"{burden_dir}/y.zarr")
this_x = zarr.open(f"{burden_dir}/x.zarr")
for xy_dir, burden_dir in zip(xy_dirs, burden_dirs):
logger.debug(xy_dir)
this_y = zarr.open(f"{xy_dir}/y.zarr")
this_x = zarr.open(f"{xy_dir}/x.zarr")
this_sample_ids_xy = zarr.load(f"{xy_dir}/sample_ids.zarr")
# this_burdens = zarr.open(f'{burden_dir}/burdens.zarr')

assert this_y.shape[0] == this_x.shape[0] # == this_burdens.shape[0]
assert this_y.shape[0] == this_x.shape[0]
n_total_samples.append(this_y.shape[0])

if not skip_burdens:
this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")
this_sample_ids_burdens = zarr.load(f"{burden_dir}/sample_ids.zarr")
assert this_y.shape[0] == this_burdens.shape[0]
logger.debug(this_sample_ids_xy, this_sample_ids_burdens)
assert np.array_equal(this_sample_ids_xy, this_sample_ids_burdens)

n_total_samples = np.sum(n_total_samples)
print(f"Total number of samples {n_total_samples}")
logger.info(f"Total number of samples: {n_total_samples}")
if not skip_burdens:
this_burdens = zarr.open(
f"{burden_dir}/burdens.zarr"
) # any burden tensor (here from the last file to get dims 1 -n)
burdens = zarr.open(
Path(out_dir) / "burdens.zarr",
Path(out_dir_burdens) / "burdens.zarr",
mode="a",
shape=(n_total_samples,) + this_burdens.shape[1:],
chunks=(1000, 1000),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
print(f"burdens shape: {burdens.shape}")
else:
burdens = None
logger.info(f"burdens shape: {burdens.shape}")
sample_ids_burdens = zarr.open(
Path(out_dir_burdens) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype="U200",
compressor=Blosc(clevel=compression_level),
)

y = zarr.open(
Path(out_dir) / "y.zarr",
Path(out_dir_xy) / "y.zarr",
mode="a",
shape=(n_total_samples,) + this_y.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
x = zarr.open(
Path(out_dir) / "x.zarr",
Path(out_dir_xy) / "x.zarr",
mode="a",
shape=(n_total_samples,) + this_x.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
sample_ids_xy = zarr.open(
Path(out_dir_xy) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype="U200",
compressor=Blosc(clevel=compression_level),
)

start_idx = 0

for burden_dir in burden_dirs:
this_y = zarr.open(f"{burden_dir}/y.zarr")[:]
for xy_dir, burden_dir in zip(xy_dirs, burden_dirs):
this_y = zarr.load(f"{xy_dir}/y.zarr")
end_idx = start_idx + this_y.shape[0]
this_x = zarr.open(f"{burden_dir}/x.zarr")[:]
if not skip_burdens:
logger.info("writing burdens")
this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")[:]
burdens[start_idx:end_idx] = this_burdens
print((start_idx, end_idx))
this_x = zarr.load(f"{xy_dir}/x.zarr")
this_sample_ids_xy = zarr.load(f"{xy_dir}/sample_ids.zarr")
y[start_idx:end_idx] = this_y
x[start_idx:end_idx] = this_x
sample_ids_xy[start_idx:end_idx] = this_sample_ids_xy
if not skip_burdens:
logger.info("writing burdens")
this_burdens = zarr.load(f"{burden_dir}/burdens.zarr")
burdens[start_idx:end_idx] = this_burdens
this_sample_ids_burdens = zarr.load(f"{burden_dir}/sample_ids.zarr")
sample_ids_burdens[start_idx:end_idx] = this_sample_ids_burdens
start_idx = end_idx

# sanity check
if not skip_burdens and not np.array_equal(sample_ids_xy[:], sample_ids_burdens[:]):
logger.error(
"sample_ids_xy, sample_ids_burdens do not match:\n"
+ f"sample_ids_xy: {sample_ids_xy[:]}"
+ f"sample_ids_burdens: {sample_ids_burdens[:]}"
)
raise RuntimeError("sample_ids_xy, sample_ids_burdens do not match")

y_transformation = config["association_testing_data"]["dataset_config"].get(
"y_transformation", None
)
Expand Down Expand Up @@ -202,13 +239,12 @@ def combine_test_set_burdens(
for col in range(this_y.shape[1]):
this_y[:, col] = my_quantile_transform(this_y[:, col])
y[:] = this_y

if not skip_burdens:
genes = np.load(f"{burden_dirs[0]}/genes.npy")
np.save(Path(out_dir_burdens) / "genes.npy", genes)

print("done")
if link_burdens is not None:
source_path = Path(out_dir) / "burdens.zarr"
source_path.unlink(missing_ok=True)
source_path.symlink_to(link_burdens)
genes = np.load(f"{burden_dirs[0]}/genes.npy")
np.save(Path(out_dir) / "genes.npy", genes)


if __name__ == "__main__":
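As context for the merge logic above: combine_test_set_burdens pre-allocates one Zarr array sized to the total sample count, then copies each chunk's slice into it at a running offset. A minimal, self-contained sketch of that pattern (illustrative only; the helper name merge_zarr_chunks and the single-array layout are assumptions, not the DeepRVAT API):

# Sketch of the pre-allocate-then-copy merge pattern (hypothetical helper).
import numpy as np
import zarr
from numcodecs import Blosc

def merge_zarr_chunks(chunk_paths, out_path):
    # Pass 1: total sample count and shared trailing dimensions.
    shapes = [zarr.open(p, mode="r").shape for p in chunk_paths]
    assert all(s[1:] == shapes[0][1:] for s in shapes)
    n_total = sum(s[0] for s in shapes)

    # Pre-allocate the destination once, sized for all samples.
    out = zarr.open(
        out_path,
        mode="a",
        shape=(n_total,) + shapes[0][1:],
        chunks=(1000,) + shapes[0][1:],
        dtype=np.float32,
        compressor=Blosc(clevel=1),
    )

    # Pass 2: copy each chunk into its slice at a running offset.
    start = 0
    for p in chunk_paths:
        data = zarr.load(p)  # loads one chunk into memory at a time
        out[start : start + data.shape[0]] = data
        start += data.shape[0]
    return out

Writing into slices of a pre-allocated array keeps peak memory bounded by one chunk, which is why the function above loads each x/y/burdens chunk with zarr.load rather than concatenating everything in memory first.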
102 changes: 53 additions & 49 deletions deeprvat/data/dense_gt.py
@@ -65,7 +65,7 @@ def __init__(
split: str = "",
train_dataset: Optional[Dataset] = None,
chromosomes: List[str] = None,
phenotype_file: str = None,
phenotype_file: Optional[str] = None,
standardize_xpheno: bool = True,
standardize_anno: bool = False,
standardize_rare_anno: bool = False,
Expand Down Expand Up @@ -106,13 +106,14 @@ def __init__(
zarr_dir: Optional[str] = None,
cache_matrices: bool = False,
verbose: bool = False,
return_genotypes: bool = True,
):
if verbose:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)

self.check_samples = True # TODO undo
self.check_samples = False # NOTE: Set to True for debugging
self.split = split
self.train_dataset = train_dataset
self.chromosomes = (
@@ -134,13 +135,10 @@ def __init__(
f"Using phenotypes: x: {self.x_phenotypes}, " f"y: {self.y_phenotypes}"
)

if gt_file is None:
raise ValueError("gt_file must be specified")
self.gt_filename = gt_file
if variant_file is None:
raise ValueError("variant_file must be specified")
if phenotype_file is None:
raise ValueError("phenotype_file must be specified")
self.gt_filename = gt_file
self.return_genotypes = return_genotypes
self.variant_filename = variant_file
self.variant_matrix = None
self.genotype_matrix = None
Expand All @@ -154,9 +152,6 @@ def __init__(
self.variant_matrix = f["variant_matrix"][:]
self.genotype_matrix = f["genotype_matrix"][:]

logger.info(
f"Using phenotype file {phenotype_file} and genotype file {self.gt_filename}"
)
self.setup_phenotypes(
phenotype_file, sim_phenotype_file, skip_y_na, skip_x_na, sample_file
)
Expand Down Expand Up @@ -204,45 +199,54 @@ def __init__(
else:
self.variants_to_keep = variants_to_keep

self.setup_annotations(
annotation_file, annotation_aggregation, precomputed_annotations
)
if self.return_genotypes:
self.setup_annotations(
annotation_file, annotation_aggregation, precomputed_annotations
)

self.transform_data()
self.setup_variants(min_common_variant_count, min_common_af, variants)

self.get_variant_metadata(grouping_level)
if self.return_genotypes:
self.setup_variants(min_common_variant_count, min_common_af, variants)

self.get_variant_metadata(grouping_level)

if rare_embedding is not None:
if rare_embedding is not None and self.return_genotypes:
self.rare_embedding = getattr(rare_embedders, rare_embedding["type"])(
self, **rare_embedding["config"]
)

else:
self.rare_embedding = None

def __getitem__(self, idx: int) -> torch.tensor:
if self.variant_matrix is None:
gt_file = h5py.File(self.gt_filename, "r")
self.variant_matrix = gt_file["variant_matrix"]
self.genotype_matrix = gt_file["genotype_matrix"]
if self.cache_matrices:
self.variant_matrix = self.variant_matrix[:]
self.genotype_matrix = self.genotype_matrix[:]

# idx_pheno = self.index_map_pheno[idx] #samples and phenotype is already subset so can use idx
idx_geno = self.index_map_geno[idx]
sparse_variants = self.variant_matrix[idx_geno, :]
sparse_genotype = self.genotype_matrix[idx_geno, :]
(
common_variants,
all_sparse_variants,
sparse_genotype,
) = self.get_common_variants(sparse_variants, sparse_genotype)

rare_variant_annotations = self.get_rare_variants(
idx, all_sparse_variants, sparse_genotype
)
if self.return_genotypes:
if self.variant_matrix is None or self.genotype_matrix is None:
gt_file = h5py.File(self.gt_filename, "r")
self.variant_matrix = gt_file["variant_matrix"]
self.genotype_matrix = gt_file["genotype_matrix"]
if self.cache_matrices:
self.variant_matrix = self.variant_matrix[:]
self.genotype_matrix = self.genotype_matrix[:]

idx_geno = self.index_map_geno[idx]
if self.check_samples:
# sanity check, can be removed in future
assert self.samples_gt[idx_geno] == self.samples[idx]

sparse_variants = self.variant_matrix[idx_geno, :]
sparse_genotype = self.genotype_matrix[idx_geno, :]
(
common_variants,
all_sparse_variants,
sparse_genotype,
) = self.get_common_variants(sparse_variants, sparse_genotype)

rare_variant_annotations = self.get_rare_variants(
idx, all_sparse_variants, sparse_genotype
)
else:
common_variants = torch.tensor([], dtype=torch.float)
rare_variant_annotations = torch.tensor([], dtype=torch.float)

phenotypes = self.phenotype_df.iloc[
idx, :
Expand All @@ -255,9 +259,7 @@ def __getitem__(self, idx: int) -> torch.tensor:
y = torch.tensor(
phenotypes[self.y_phenotypes].to_numpy(dtype=np.float32), dtype=torch.float
)
if self.check_samples:
# sanity check, can be removed in future
assert self.samples_gt[idx_geno] == self.samples[idx]

return {
"sample": self.samples[idx],
"x_phenotypes": x_phenotype_tensor,
Expand Down Expand Up @@ -287,11 +289,6 @@ def setup_phenotypes(
):
logger.debug("Reading phenotype dataframe")
self.phenotype_df = pd.read_parquet(phenotype_file, engine="pyarrow")
with h5py.File(self.gt_filename, "r") as f:
samples_gt = f["samples"][:]
samples_gt = np.array([item.decode("utf-8") for item in samples_gt])
if self.check_samples:
self.samples_gt = samples_gt
samples_phenotype_df = np.array(self.phenotype_df.index)
# phenotypes_df has first to be sorted in the same order as samples_gt
if sim_phenotype_file is not None:
Expand All @@ -315,14 +312,21 @@ def setup_phenotypes(
logger.warning(
"Some samples from the sample file were not found in the data"
)
sample_to_keep = shared_samples
samples_to_keep = shared_samples
logger.info(
f"Number of samples in sample file and in phenotype_df: {len(samples_to_keep)}"
)
else:
logger.info("Using all samples in phenotype df")
samples_to_keep = copy.deepcopy(samples_phenotype_df)

# if self.return_genotypes:
with h5py.File(self.gt_filename, "r") as f:
samples_gt = f["samples"][:]
samples_gt = np.array([item.decode("utf-8") for item in samples_gt])
if self.check_samples:
self.samples_gt = samples_gt

logger.info("Removing samples that are not in genotype file")

samples_to_keep = np.array(
Expand Down Expand Up @@ -353,11 +357,11 @@ def setup_phenotypes(
mask_cols += self.x_phenotypes
mask = (self.phenotype_df[mask_cols].notna()).all(axis=1)
mask &= samples_to_keep_mask
samples_to_keep = self.phenotype_df.index[mask]
self.samples = self.phenotype_df.index[mask]
self.n_samples = mask.sum()
logger.info(f"Final number of kept samples: {self.n_samples}")

self.phenotype_df = self.phenotype_df[mask]
self.samples = self.phenotype_df.index.to_numpy()

# account for the fact that genotypes.h5 and phenotype_df can have different
# orders of their samples
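The return_genotypes flag introduced above lets the dataset serve phenotypes only (e.g., when burdens were precomputed by the chunked writer) while keeping batch keys stable for downstream collate code. A hedged sketch of that pattern, independent of the real DenseGTDataset (class and field names here are illustrative):

# Sketch: a dataset that can skip genotype loading but keeps its output schema.
import torch
from torch.utils.data import Dataset

class PhenotypeOnlySketch(Dataset):
    def __init__(self, x, y, samples, return_genotypes=True):
        self.x, self.y, self.samples = x, y, samples
        self.return_genotypes = return_genotypes

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        if self.return_genotypes:
            # Stand-in for real work: the actual dataset reads sparse
            # variant/genotype matrices and builds rare-variant annotations.
            common_variants = torch.zeros(10, dtype=torch.float)
            rare_variant_annotations = torch.zeros(5, dtype=torch.float)
        else:
            # Empty tensors keep the same keys in every batch, so
            # downstream code need not special-case the no-genotype mode.
            common_variants = torch.tensor([], dtype=torch.float)
            rare_variant_annotations = torch.tensor([], dtype=torch.float)
        return {
            "sample": self.samples[idx],
            "x_phenotypes": torch.tensor(self.x[idx], dtype=torch.float),
            "common_variants": common_variants,
            "rare_variant_annotations": rare_variant_annotations,
            "y": torch.tensor(self.y[idx], dtype=torch.float),
        }

Usage mirrors the diff: constructing with return_genotypes=False yields batches whose genotype fields are empty but present, matching the empty-tensor branch added to __getitem__ above.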
(Diff of the remaining 37 changed files not shown.)
