Feature/chunked writer precomputed burdens testing combo #135

Merged Sep 24, 2024 (141 commits)

Commits
2e4ea84
make genotype and variant files optional
bfclarke Mar 4, 2024
94722ed
specify burden file manually
bfclarke Mar 4, 2024
5655919
specify burden file manually
bfclarke Mar 5, 2024
742b09f
add pipelines
bfclarke Mar 12, 2024
8deacd9
bug fixes
bfclarke Mar 12, 2024
ad61bd4
add pipelines
bfclarke Mar 12, 2024
3801859
add eval pipeline
bfclarke Mar 12, 2024
1f8c424
Small bug fix removing overwriting of function option name.
ThibaultBechtler Mar 15, 2024
48b039f
add argument for additional REGENIE options
bfclarke Mar 29, 2024
4c45063
keep beta in results
bfclarke Mar 29, 2024
7925630
add conditional analysis
bfclarke Mar 29, 2024
67fa91c
Merge branch 'feature-precomputed-burden-testing' of github.com:PMBio…
bfclarke Mar 29, 2024
b964fda
run regenie_step2 in long queue
bfclarke Apr 11, 2024
dd41088
improve modularity
bfclarke Apr 11, 2024
f9e8698
remove .loco files from output/input
bfclarke Apr 12, 2024
232fd72
run REGENIE step 2 on verylong queue
bfclarke Apr 17, 2024
b539cd3
bug fixes
bfclarke Apr 17, 2024
4dca15d
bug fixes
bfclarke Apr 30, 2024
c28b69d
Add tests
endast May 7, 2024
9c718f7
Add test data for merging
endast May 7, 2024
2ceabb5
Fix test path
endast May 7, 2024
28798ac
unused variable
endast May 7, 2024
69e20e0
Merge branch 'feature-precomputed-burden-testing' into feature/chunke…
endast May 7, 2024
0a25ffd
bug fix
bfclarke May 15, 2024
ccb92c1
bug fix (remove broken debugging code)
bfclarke May 17, 2024
669140b
remove duplicate rule
bfclarke May 17, 2024
7d13b7b
update pipelines for new burden directory structure
bfclarke May 22, 2024
751c23d
bug fixes for REGENIE pipeline
bfclarke May 22, 2024
97b43ec
bug fixes for REGENIE pipelines
bfclarke May 22, 2024
cbd4dcf
add option for determinism in training
bfclarke May 23, 2024
4b48b0e
add test for results of training_association_testing pipeline
bfclarke Jun 25, 2024
c6b4f3b
make genotype and variant files optional
bfclarke Mar 4, 2024
7bcae5f
specify burden file manually
bfclarke Mar 4, 2024
9b9c6e5
specify burden file manually
bfclarke Mar 5, 2024
ca26206
add pipelines
bfclarke Mar 12, 2024
a51d40e
bug fixes
bfclarke Mar 12, 2024
d76628f
add pipelines
bfclarke Mar 12, 2024
0bc1409
add eval pipeline
bfclarke Mar 12, 2024
b0940e9
add argument for additional REGENIE options
bfclarke Mar 29, 2024
318c793
add conditional analysis
bfclarke Mar 29, 2024
1a765b0
Small bug fix removing overwriting of function option name.
ThibaultBechtler Mar 15, 2024
48900ea
run regenie_step2 in long queue
bfclarke Apr 11, 2024
a03e0ee
improve modularity
bfclarke Apr 11, 2024
2380406
remove .loco files from output/input
bfclarke Apr 12, 2024
d810e41
run REGENIE step 2 on verylong queue
bfclarke Apr 17, 2024
e69b33c
bug fixes
bfclarke Apr 17, 2024
756d797
bug fixes
bfclarke Apr 30, 2024
6965d73
bug fix
bfclarke May 15, 2024
50ed569
bug fix (remove broken debugging code)
bfclarke May 17, 2024
d041c64
remove duplicate rule
bfclarke May 17, 2024
32ba53f
update pipelines for new burden directory structure
bfclarke May 22, 2024
213d184
bug fixes for REGENIE pipeline
bfclarke May 22, 2024
34a4608
bug fixes for REGENIE pipelines
bfclarke May 22, 2024
01e1e42
add option for determinism in training
bfclarke May 23, 2024
6d19c0d
add test for results of training_association_testing pipeline
bfclarke Jun 25, 2024
218200d
Merge branch 'feature-precomputed-burden-testing' of github.com:PMBio…
bfclarke Jul 3, 2024
4ce40b2
make genotype and variant files optional
bfclarke Mar 4, 2024
2edde91
specify burden file manually
bfclarke Mar 4, 2024
d4f929e
specify burden file manually
bfclarke Mar 5, 2024
9124ea7
add pipelines
bfclarke Mar 12, 2024
878b322
bug fixes
bfclarke Mar 12, 2024
e14fc85
add pipelines
bfclarke Mar 12, 2024
abbd76b
add eval pipeline
bfclarke Mar 12, 2024
801fff8
add argument for additional REGENIE options
bfclarke Mar 29, 2024
e83cd7c
add conditional analysis
bfclarke Mar 29, 2024
0e33290
Small bug fix removing overwriting of function option name.
ThibaultBechtler Mar 15, 2024
585e399
run regenie_step2 in long queue
bfclarke Apr 11, 2024
14990b9
improve modularity
bfclarke Apr 11, 2024
b2d292c
remove .loco files from output/input
bfclarke Apr 12, 2024
a9cc677
run REGENIE step 2 on verylong queue
bfclarke Apr 17, 2024
d2ae3b7
bug fixes
bfclarke Apr 17, 2024
3f9beaa
bug fixes
bfclarke Apr 30, 2024
c7e8bd0
bug fix
bfclarke May 15, 2024
f297ff8
bug fix (remove broken debugging code)
bfclarke May 17, 2024
07661ec
remove duplicate rule
bfclarke May 17, 2024
9c8fbde
update pipelines for new burden directory structure
bfclarke May 22, 2024
4f1f1fa
bug fixes for REGENIE pipeline
bfclarke May 22, 2024
3bc2fb0
bug fixes for REGENIE pipelines
bfclarke May 22, 2024
e55c902
add option for determinism in training
bfclarke May 23, 2024
010c22c
add test for results of training_association_testing pipeline
bfclarke Jun 25, 2024
90c65a7
make genotype and variant files optional
bfclarke Aug 22, 2024
e1992c7
specify burden file manually
bfclarke Mar 4, 2024
f7bc9c0
specify burden file manually
bfclarke Aug 22, 2024
b5fdf0a
bug fixes
bfclarke Mar 12, 2024
0d1da22
add eval pipeline
bfclarke Mar 12, 2024
b6a5b4b
add argument for additional REGENIE options
bfclarke Aug 22, 2024
232e7dd
update pipelines for new burden directory structure
bfclarke Aug 23, 2024
28b616e
bug fixes for REGENIE pipelines
bfclarke Aug 23, 2024
686e5d3
add test for results of training_association_testing pipeline
bfclarke Aug 23, 2024
8d99a4d
corrections to pipelines
bfclarke Aug 22, 2024
f35fe30
Merge branch 'feature-precomputed-burden-testing' of github.com:PMBio…
bfclarke Aug 23, 2024
f57b29e
delete old example config
bfclarke Aug 27, 2024
43255d0
bug fixes for new config file naming
bfclarke Aug 28, 2024
4347fd1
improve error message when repo is misspecified
bfclarke Aug 28, 2024
da0474c
correct data key
bfclarke Aug 28, 2024
b86ff09
add command to compare association testing results
bfclarke Aug 28, 2024
703c27c
cast trial ID to int to fix intermittent bug where it's stored as float
bfclarke Aug 28, 2024
d2d59d4
expose deterministic option (for testing/debugging)
bfclarke Aug 28, 2024
54ef3b3
Merge branch 'feature-precomputed-burden-testing' into feature/chunke…
endast Aug 30, 2024
d700de4
Add initial changes
endast Sep 3, 2024
d26b548
remove deeprvat_repo_dir
endast Sep 3, 2024
32f409d
Add chunked writer
endast Sep 3, 2024
c470c84
Remove deeprvat from paths
endast Sep 3, 2024
739717e
Fix dtype
endast Sep 4, 2024
c77d793
Add merge to snakemake pipeline
endast Sep 4, 2024
c891b41
Remove x.zarr and y.zarr
endast Sep 4, 2024
06e34b6
Working test for 5 chunks
endast Sep 4, 2024
3ad8434
Add back sample_ids test
endast Sep 4, 2024
45f6a32
Fix pretrained path
endast Sep 4, 2024
dd62d49
Remove phenotypes key
endast Sep 4, 2024
7873bac
Merge branch 'feature/chunked-writer-precomputed-burdens-testing-comb…
endast Sep 4, 2024
db92c10
Fix failing tests
endast Sep 4, 2024
ff82d44
More config errors
endast Sep 4, 2024
0b6aeda
fixes to pipelines
bfclarke Sep 9, 2024
6c6bf2f
remove unfinished sections
bfclarke Sep 9, 2024
5181ed6
pipeline fixes
bfclarke Sep 10, 2024
68d2ccd
reduce number of phenotypes in example configs
bfclarke Sep 10, 2024
0c70898
bug fix
bfclarke Sep 10, 2024
5511a6d
pipeline fixes
bfclarke Sep 10, 2024
4dace76
add example config for REGENIE
bfclarke Sep 10, 2024
49979ab
pipeline fixes
bfclarke Sep 10, 2024
e41bb3a
pipeline fixes
bfclarke Sep 10, 2024
dc5df06
pipeline fixes
bfclarke Sep 10, 2024
ebd7baf
adapt CV pipeline
bfclarke Sep 13, 2024
52b3990
pipeline fix
bfclarke Sep 16, 2024
01929c2
reduce uninformative logging
bfclarke Sep 17, 2024
fdcdc45
modifications for running on compute cluster
bfclarke Sep 17, 2024
eb996e5
pipeline fix
bfclarke Sep 17, 2024
bed51b6
fix typo
bfclarke Sep 17, 2024
6d3275e
delete unused files
bfclarke Sep 17, 2024
242c6b8
fixup! Format Python code with psf/black
Sep 17, 2024
3483fc6
pipeline fixes
bfclarke Sep 20, 2024
b86080a
pipeline fixes
bfclarke Sep 20, 2024
2662477
Merge branch 'feature/chunked-writer-precomputed-burdens-testing-comb…
bfclarke Sep 20, 2024
c09eec3
code cleanup
bfclarke Sep 24, 2024
0f3e2d8
remove unneeded test files
bfclarke Sep 24, 2024
c238e75
fixup! Format Python code with psf/black
Sep 24, 2024
a052bd6
correct typo
bfclarke Sep 24, 2024
7c2e867
black
bfclarke Sep 24, 2024
14f783f
Merge branch 'feature/chunked-writer-precomputed-burdens-testing-comb…
bfclarke Sep 24, 2024
1b3dfb1
fixup! Format Python code with psf/black
Sep 24, 2024
102 changes: 69 additions & 33 deletions deeprvat/cv_utils.py
@@ -96,80 +96,117 @@ def generate_test_config(input_config, out_file, fold, n_folds):


@cli.command()
@click.option("--link-burdens", type=click.Path())
@click.option("--skip-burdens", is_flag=True)
@click.option("--burden-dirs", "-b", multiple=True)
@click.argument("out_dir", type=click.Path(), default="./")
@click.option("--xy-dirs", "-b", multiple=True)
@click.argument("out_dir_burdens", type=click.Path(), default="./")
@click.argument("out_dir_xy", type=click.Path(), default="./")
@click.argument("config_file", type=click.Path(exists=True))
def combine_test_set_burdens(
out_dir,
link_burdens,
out_dir_burdens,
out_dir_xy,
skip_burdens,
burden_dirs,
xy_dirs,
config_file,
):
assert len(burden_dirs) == len(xy_dirs)

with open(config_file) as f:
config = yaml.safe_load(f)
compression_level = 1
skip_burdens = link_burdens is not None
n_total_samples = []
for burden_dir in burden_dirs:
print(burden_dir)
this_y = zarr.open(f"{burden_dir}/y.zarr")
this_x = zarr.open(f"{burden_dir}/x.zarr")
for xy_dir, burden_dir in zip(xy_dirs, burden_dirs):
logger.debug(xy_dir)
this_y = zarr.open(f"{xy_dir}/y.zarr")
this_x = zarr.open(f"{xy_dir}/x.zarr")
this_sample_ids_xy = zarr.load(f"{xy_dir}/sample_ids.zarr")
# this_burdens = zarr.open(f'{burden_dir}/burdens.zarr')

assert this_y.shape[0] == this_x.shape[0] # == this_burdens.shape[0]
assert this_y.shape[0] == this_x.shape[0]
n_total_samples.append(this_y.shape[0])

if not skip_burdens:
this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")
this_sample_ids_burdens = zarr.load(f"{burden_dir}/sample_ids.zarr")
assert this_y.shape[0] == this_burdens.shape[0]
logger.debug(this_sample_ids_xy, this_sample_ids_burdens)
assert np.array_equal(this_sample_ids_xy, this_sample_ids_burdens)

n_total_samples = np.sum(n_total_samples)
print(f"Total number of samples {n_total_samples}")
logger.info(f"Total number of samples: {n_total_samples}")
if not skip_burdens:
this_burdens = zarr.open(
f"{burden_dir}/burdens.zarr"
) # any burden tensor (here from the last file to get dims 1 -n)
burdens = zarr.open(
Path(out_dir) / "burdens.zarr",
Path(out_dir_burdens) / "burdens.zarr",
mode="a",
shape=(n_total_samples,) + this_burdens.shape[1:],
chunks=(1000, 1000),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
print(f"burdens shape: {burdens.shape}")
else:
burdens = None
logger.info(f"burdens shape: {burdens.shape}")
sample_ids_burdens = zarr.open(
Path(out_dir_burdens) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype="U200",
compressor=Blosc(clevel=compression_level),
)

y = zarr.open(
Path(out_dir) / "y.zarr",
Path(out_dir_xy) / "y.zarr",
mode="a",
shape=(n_total_samples,) + this_y.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
x = zarr.open(
Path(out_dir) / "x.zarr",
Path(out_dir_xy) / "x.zarr",
mode="a",
shape=(n_total_samples,) + this_x.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
sample_ids_xy = zarr.open(
Path(out_dir_xy) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype="U200",
compressor=Blosc(clevel=compression_level),
)

start_idx = 0

for burden_dir in burden_dirs:
this_y = zarr.open(f"{burden_dir}/y.zarr")[:]
for xy_dir, burden_dir in zip(xy_dirs, burden_dirs):
this_y = zarr.load(f"{xy_dir}/y.zarr")
end_idx = start_idx + this_y.shape[0]
this_x = zarr.open(f"{burden_dir}/x.zarr")[:]
if not skip_burdens:
logger.info("writing burdens")
this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")[:]
burdens[start_idx:end_idx] = this_burdens
print((start_idx, end_idx))
this_x = zarr.load(f"{xy_dir}/x.zarr")
this_sample_ids_xy = zarr.load(f"{xy_dir}/sample_ids.zarr")
y[start_idx:end_idx] = this_y
x[start_idx:end_idx] = this_x
sample_ids_xy[start_idx:end_idx] = this_sample_ids_xy
if not skip_burdens:
logger.info("writing burdens")
this_burdens = zarr.load(f"{burden_dir}/burdens.zarr")
burdens[start_idx:end_idx] = this_burdens
this_sample_ids_burdens = zarr.load(f"{burden_dir}/sample_ids.zarr")
sample_ids_burdens[start_idx:end_idx] = this_sample_ids_burdens
start_idx = end_idx

# sanity check
if not skip_burdens and not np.array_equal(sample_ids_xy[:], sample_ids_burdens[:]):
logger.error(
"sample_ids_xy, sample_ids_burdens do not match:\n"
+ f"sample_ids_xy: {sample_ids_xy[:]}"
+ f"sample_ids_burdens: {sample_ids_burdens[:]}"
)
raise RuntimeError("sample_ids_xy, sample_ids_burdens do not match")

y_transformation = config["association_testing_data"]["dataset_config"].get(
"y_transformation", None
)
@@ -202,13 +239,12 @@ def combine_test_set_burdens(
for col in range(this_y.shape[1]):
this_y[:, col] = my_quantile_transform(this_y[:, col])
y[:] = this_y

if not skip_burdens:
genes = np.load(f"{burden_dirs[0]}/genes.npy")
np.save(Path(out_dir_burdens) / "genes.npy", genes)

print("done")
if link_burdens is not None:
source_path = Path(out_dir) / "burdens.zarr"
source_path.unlink(missing_ok=True)
source_path.symlink_to(link_burdens)
genes = np.load(f"{burden_dirs[0]}/genes.npy")
np.save(Path(out_dir) / "genes.npy", genes)


if __name__ == "__main__":
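The rewritten `combine_test_set_burdens` preallocates one zarr array sized for the total sample count, copies each per-chunk array into its slice, and finally cross-checks `sample_ids_xy` against `sample_ids_burdens`. A minimal standalone sketch of that concatenation pattern follows; the chunk directories, output path, and two-chunk setup are hypothetical, not the PR's CLI.

```python
from pathlib import Path

import numpy as np
import zarr
from numcodecs import Blosc

# Hypothetical per-chunk directories, each holding a y.zarr written by one job
# (the real command receives these via --xy-dirs / --burden-dirs).
chunk_dirs = [Path("chunk_0"), Path("chunk_1")]

n_per_chunk = [zarr.open(str(d / "y.zarr")).shape[0] for d in chunk_dirs]
n_total = int(np.sum(n_per_chunk))
trailing_dims = zarr.open(str(chunk_dirs[0] / "y.zarr")).shape[1:]

# Preallocate the combined array once, sized for all samples.
combined = zarr.open(
    "combined/y.zarr",
    mode="a",
    shape=(n_total,) + trailing_dims,
    chunks=(None, None),
    dtype=np.float32,
    compressor=Blosc(clevel=1),
)

# Copy each chunk into its slice of the preallocated array.
start_idx = 0
for d in chunk_dirs:
    this = zarr.load(str(d / "y.zarr"))  # reads the whole chunk into memory
    combined[start_idx : start_idx + this.shape[0]] = this
    start_idx += this.shape[0]
```

Note the signature change: the single `out_dir` and `--link-burdens` are gone; burdens and x/y arrays now land in separate `out_dir_burdens` and `out_dir_xy` directories, with sample IDs written alongside each so the final `np.array_equal` sanity check can catch misaligned chunks.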
102 changes: 53 additions & 49 deletions deeprvat/data/dense_gt.py
@@ -65,7 +65,7 @@ def __init__(
split: str = "",
train_dataset: Optional[Dataset] = None,
chromosomes: List[str] = None,
phenotype_file: str = None,
phenotype_file: Optional[str] = None,
standardize_xpheno: bool = True,
standardize_anno: bool = False,
standardize_rare_anno: bool = False,
@@ -106,13 +106,14 @@ def __init__(
zarr_dir: Optional[str] = None,
cache_matrices: bool = False,
verbose: bool = False,
return_genotypes: bool = True,
):
if verbose:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)

self.check_samples = True # TODO undo
self.check_samples = False # NOTE: Set to True for debugging
self.split = split
self.train_dataset = train_dataset
self.chromosomes = (
@@ -134,13 +135,10 @@
f"Using phenotypes: x: {self.x_phenotypes}, " f"y: {self.y_phenotypes}"
)

if gt_file is None:
raise ValueError("gt_file must be specified")
self.gt_filename = gt_file
if variant_file is None:
raise ValueError("variant_file must be specified")
if phenotype_file is None:
raise ValueError("phenotype_file must be specified")
self.gt_filename = gt_file
self.return_genotypes = return_genotypes
self.variant_filename = variant_file
self.variant_matrix = None
self.genotype_matrix = None
@@ -154,9 +152,6 @@
self.variant_matrix = f["variant_matrix"][:]
self.genotype_matrix = f["genotype_matrix"][:]

logger.info(
f"Using phenotype file {phenotype_file} and genotype file {self.gt_filename}"
)
self.setup_phenotypes(
phenotype_file, sim_phenotype_file, skip_y_na, skip_x_na, sample_file
)
@@ -204,45 +199,54 @@ def __init__(
else:
self.variants_to_keep = variants_to_keep

self.setup_annotations(
annotation_file, annotation_aggregation, precomputed_annotations
)
if self.return_genotypes:
self.setup_annotations(
annotation_file, annotation_aggregation, precomputed_annotations
)

self.transform_data()
self.setup_variants(min_common_variant_count, min_common_af, variants)

self.get_variant_metadata(grouping_level)
if self.return_genotypes:
self.setup_variants(min_common_variant_count, min_common_af, variants)

self.get_variant_metadata(grouping_level)

if rare_embedding is not None:
if rare_embedding is not None and self.return_genotypes:
self.rare_embedding = getattr(rare_embedders, rare_embedding["type"])(
self, **rare_embedding["config"]
)

else:
self.rare_embedding = None

def __getitem__(self, idx: int) -> torch.tensor:
if self.variant_matrix is None:
gt_file = h5py.File(self.gt_filename, "r")
self.variant_matrix = gt_file["variant_matrix"]
self.genotype_matrix = gt_file["genotype_matrix"]
if self.cache_matrices:
self.variant_matrix = self.variant_matrix[:]
self.genotype_matrix = self.genotype_matrix[:]

# idx_pheno = self.index_map_pheno[idx] #samples and phenotype is already subset so can use idx
idx_geno = self.index_map_geno[idx]
sparse_variants = self.variant_matrix[idx_geno, :]
sparse_genotype = self.genotype_matrix[idx_geno, :]
(
common_variants,
all_sparse_variants,
sparse_genotype,
) = self.get_common_variants(sparse_variants, sparse_genotype)

rare_variant_annotations = self.get_rare_variants(
idx, all_sparse_variants, sparse_genotype
)
if self.return_genotypes:
if self.variant_matrix is None or self.genotype_matrix is None:
gt_file = h5py.File(self.gt_filename, "r")
self.variant_matrix = gt_file["variant_matrix"]
self.genotype_matrix = gt_file["genotype_matrix"]
if self.cache_matrices:
self.variant_matrix = self.variant_matrix[:]
self.genotype_matrix = self.genotype_matrix[:]

idx_geno = self.index_map_geno[idx]
if self.check_samples:
# sanity check, can be removed in future
assert self.samples_gt[idx_geno] == self.samples[idx]

sparse_variants = self.variant_matrix[idx_geno, :]
sparse_genotype = self.genotype_matrix[idx_geno, :]
(
common_variants,
all_sparse_variants,
sparse_genotype,
) = self.get_common_variants(sparse_variants, sparse_genotype)

rare_variant_annotations = self.get_rare_variants(
idx, all_sparse_variants, sparse_genotype
)
else:
common_variants = torch.tensor([], dtype=torch.float)
rare_variant_annotations = torch.tensor([], dtype=torch.float)

phenotypes = self.phenotype_df.iloc[
idx, :
Expand All @@ -255,9 +259,7 @@ def __getitem__(self, idx: int) -> torch.tensor:
y = torch.tensor(
phenotypes[self.y_phenotypes].to_numpy(dtype=np.float32), dtype=torch.float
)
if self.check_samples:
# sanity check, can be removed in future
assert self.samples_gt[idx_geno] == self.samples[idx]

return {
"sample": self.samples[idx],
"x_phenotypes": x_phenotype_tensor,
@@ -287,11 +289,6 @@ def setup_phenotypes(
):
logger.debug("Reading phenotype dataframe")
self.phenotype_df = pd.read_parquet(phenotype_file, engine="pyarrow")
with h5py.File(self.gt_filename, "r") as f:
samples_gt = f["samples"][:]
samples_gt = np.array([item.decode("utf-8") for item in samples_gt])
if self.check_samples:
self.samples_gt = samples_gt
samples_phenotype_df = np.array(self.phenotype_df.index)
# phenotypes_df has first to be sorted in the same order as samples_gt
if sim_phenotype_file is not None:
@@ -315,14 +312,21 @@
logger.warning(
"Some samples from the sample file were not found in the data"
)
sample_to_keep = shared_samples
samples_to_keep = shared_samples
logger.info(
f"Number of samples in sample file and in phenotype_df: {len(samples_to_keep)}"
)
else:
logger.info("Using all samples in phenotype df")
samples_to_keep = copy.deepcopy(samples_phenotype_df)

# if self.return_genotypes:
with h5py.File(self.gt_filename, "r") as f:
samples_gt = f["samples"][:]
samples_gt = np.array([item.decode("utf-8") for item in samples_gt])
if self.check_samples:
self.samples_gt = samples_gt

logger.info("Removing samples that are not in genotype file")

samples_to_keep = np.array(
@@ -353,11 +357,11 @@ def setup_phenotypes(
mask_cols += self.x_phenotypes
mask = (self.phenotype_df[mask_cols].notna()).all(axis=1)
mask &= samples_to_keep_mask
samples_to_keep = self.phenotype_df.index[mask]
self.samples = self.phenotype_df.index[mask]
self.n_samples = mask.sum()
logger.info(f"Final number of kept samples: {self.n_samples}")

self.phenotype_df = self.phenotype_df[mask]
self.samples = self.phenotype_df.index.to_numpy()

# account for the fact that genotypes.h5 and phenotype_df can have different
# orders of their samples
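The `__getitem__` rewrite gates all genotype work behind `self.return_genotypes` and keeps the earlier lazy-open behavior for the HDF5 handles. A minimal sketch of that lazy-open/caching pattern, reduced from the diff; the class name and return value here are illustrative only, not the PR's API.

```python
import h5py

class LazyGenotypes:
    """Open the genotype HDF5 file on first access, optionally caching it."""

    def __init__(self, gt_filename: str, cache_matrices: bool = False):
        self.gt_filename = gt_filename
        self.cache_matrices = cache_matrices
        self.variant_matrix = None
        self.genotype_matrix = None

    def __getitem__(self, idx_geno: int):
        if self.variant_matrix is None or self.genotype_matrix is None:
            gt_file = h5py.File(self.gt_filename, "r")  # handle stays open
            self.variant_matrix = gt_file["variant_matrix"]
            self.genotype_matrix = gt_file["genotype_matrix"]
            if self.cache_matrices:
                # One-time full read into numpy; trades memory for per-item I/O.
                self.variant_matrix = self.variant_matrix[:]
                self.genotype_matrix = self.genotype_matrix[:]
        return self.variant_matrix[idx_geno, :], self.genotype_matrix[idx_geno, :]
```

Keeping the handle open after the first access avoids reopening the file on every item, which matters when a DataLoader worker calls `__getitem__` many times per epoch; `cache_matrices=True` instead pays the memory cost of a single full read up front.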