Feature/chunked writer precomputed burdens testing combo #135

Merged Sep 24, 2024 (141 commits)

Commits
2e4ea84
make genotype and variant files optional
bfclarke Mar 4, 2024
94722ed
specify burden file manually
bfclarke Mar 4, 2024
5655919
specify burden file manually
bfclarke Mar 5, 2024
742b09f
add pipelines
bfclarke Mar 12, 2024
8deacd9
bug fixes
bfclarke Mar 12, 2024
ad61bd4
add pipelines
bfclarke Mar 12, 2024
3801859
add eval pipeline
bfclarke Mar 12, 2024
1f8c424
Small bug fix removing overwriting of function option name.
ThibaultBechtler Mar 15, 2024
48b039f
add argument for additional REGENIE options
bfclarke Mar 29, 2024
4c45063
keep beta in results
bfclarke Mar 29, 2024
7925630
add conditional analysis
bfclarke Mar 29, 2024
67fa91c
Merge branch 'feature-precomputed-burden-testing' of github.com:PMBio…
bfclarke Mar 29, 2024
b964fda
run regenie_step2 in long queue
bfclarke Apr 11, 2024
dd41088
improve modularity
bfclarke Apr 11, 2024
f9e8698
remove .loco files from output/input
bfclarke Apr 12, 2024
232fd72
run REGENIE step 2 on verylong queue
bfclarke Apr 17, 2024
b539cd3
bug fixes
bfclarke Apr 17, 2024
4dca15d
bug fixes
bfclarke Apr 30, 2024
c28b69d
Add tests
endast May 7, 2024
9c718f7
Add test data for merging
endast May 7, 2024
2ceabb5
Fix test path
endast May 7, 2024
28798ac
unused variable
endast May 7, 2024
69e20e0
Merge branch 'feature-precomputed-burden-testing' into feature/chunke…
endast May 7, 2024
0a25ffd
bug fix
bfclarke May 15, 2024
ccb92c1
bug fix (remove broken debugging code)
bfclarke May 17, 2024
669140b
remove duplicate rule
bfclarke May 17, 2024
7d13b7b
update pipelines for new burden directory structure
bfclarke May 22, 2024
751c23d
bug fixes for REGENIE pipeline
bfclarke May 22, 2024
97b43ec
bug fixes for REGENIE pipelines
bfclarke May 22, 2024
cbd4dcf
add option for determinism in training
bfclarke May 23, 2024
4b48b0e
add test for results of training_association_testing pipeline
bfclarke Jun 25, 2024
c6b4f3b
make genotype and variant files optional
bfclarke Mar 4, 2024
7bcae5f
specify burden file manually
bfclarke Mar 4, 2024
9b9c6e5
specify burden file manually
bfclarke Mar 5, 2024
ca26206
add pipelines
bfclarke Mar 12, 2024
a51d40e
bug fixes
bfclarke Mar 12, 2024
d76628f
add pipelines
bfclarke Mar 12, 2024
0bc1409
add eval pipeline
bfclarke Mar 12, 2024
b0940e9
add argument for additional REGENIE options
bfclarke Mar 29, 2024
318c793
add conditional analysis
bfclarke Mar 29, 2024
1a765b0
Small bug fix removing overwriting of function option name.
ThibaultBechtler Mar 15, 2024
48900ea
run regenie_step2 in long queue
bfclarke Apr 11, 2024
a03e0ee
improve modularity
bfclarke Apr 11, 2024
2380406
remove .loco files from output/input
bfclarke Apr 12, 2024
d810e41
run REGENIE step 2 on verylong queue
bfclarke Apr 17, 2024
e69b33c
bug fixes
bfclarke Apr 17, 2024
756d797
bug fixes
bfclarke Apr 30, 2024
6965d73
bug fix
bfclarke May 15, 2024
50ed569
bug fix (remove broken debugging code)
bfclarke May 17, 2024
d041c64
remove duplicate rule
bfclarke May 17, 2024
32ba53f
update pipelines for new burden directory structure
bfclarke May 22, 2024
213d184
bug fixes for REGENIE pipeline
bfclarke May 22, 2024
34a4608
bug fixes for REGENIE pipelines
bfclarke May 22, 2024
01e1e42
add option for determinism in training
bfclarke May 23, 2024
6d19c0d
add test for results of training_association_testing pipeline
bfclarke Jun 25, 2024
218200d
Merge branch 'feature-precomputed-burden-testing' of github.com:PMBio…
bfclarke Jul 3, 2024
4ce40b2
make genotype and variant files optional
bfclarke Mar 4, 2024
2edde91
specify burden file manually
bfclarke Mar 4, 2024
d4f929e
specify burden file manually
bfclarke Mar 5, 2024
9124ea7
add pipelines
bfclarke Mar 12, 2024
878b322
bug fixes
bfclarke Mar 12, 2024
e14fc85
add pipelines
bfclarke Mar 12, 2024
abbd76b
add eval pipeline
bfclarke Mar 12, 2024
801fff8
add argument for additional REGENIE options
bfclarke Mar 29, 2024
e83cd7c
add conditional analysis
bfclarke Mar 29, 2024
0e33290
Small bug fix removing overwriting of function option name.
ThibaultBechtler Mar 15, 2024
585e399
run regenie_step2 in long queue
bfclarke Apr 11, 2024
14990b9
improve modularity
bfclarke Apr 11, 2024
b2d292c
remove .loco files from output/input
bfclarke Apr 12, 2024
a9cc677
run REGENIE step 2 on verylong queue
bfclarke Apr 17, 2024
d2ae3b7
bug fixes
bfclarke Apr 17, 2024
3f9beaa
bug fixes
bfclarke Apr 30, 2024
c7e8bd0
bug fix
bfclarke May 15, 2024
f297ff8
bug fix (remove broken debugging code)
bfclarke May 17, 2024
07661ec
remove duplicate rule
bfclarke May 17, 2024
9c8fbde
update pipelines for new burden directory structure
bfclarke May 22, 2024
4f1f1fa
bug fixes for REGENIE pipeline
bfclarke May 22, 2024
3bc2fb0
bug fixes for REGENIE pipelines
bfclarke May 22, 2024
e55c902
add option for determinism in training
bfclarke May 23, 2024
010c22c
add test for results of training_association_testing pipeline
bfclarke Jun 25, 2024
90c65a7
make genotype and variant files optional
bfclarke Aug 22, 2024
e1992c7
specify burden file manually
bfclarke Mar 4, 2024
f7bc9c0
specify burden file manually
bfclarke Aug 22, 2024
b5fdf0a
bug fixes
bfclarke Mar 12, 2024
0d1da22
add eval pipeline
bfclarke Mar 12, 2024
b6a5b4b
add argument for additional REGENIE options
bfclarke Aug 22, 2024
232e7dd
update pipelines for new burden directory structure
bfclarke Aug 23, 2024
28b616e
bug fixes for REGENIE pipelines
bfclarke Aug 23, 2024
686e5d3
add test for results of training_association_testing pipeline
bfclarke Aug 23, 2024
8d99a4d
corrections to pipelines
bfclarke Aug 22, 2024
f35fe30
Merge branch 'feature-precomputed-burden-testing' of github.com:PMBio…
bfclarke Aug 23, 2024
f57b29e
delete old example config
bfclarke Aug 27, 2024
43255d0
bug fixes for new config file naming
bfclarke Aug 28, 2024
4347fd1
improve error message when repo is misspecified
bfclarke Aug 28, 2024
da0474c
correct data key
bfclarke Aug 28, 2024
b86ff09
add command to compare association testing results
bfclarke Aug 28, 2024
703c27c
cast trial ID to int to fix intermittent bug where it's stored as float
bfclarke Aug 28, 2024
d2d59d4
expose deterministic option (for testing/debugging)
bfclarke Aug 28, 2024
54ef3b3
Merge branch 'feature-precomputed-burden-testing' into feature/chunke…
endast Aug 30, 2024
d700de4
Add initial changes
endast Sep 3, 2024
d26b548
remove deeprvat_repo_dir
endast Sep 3, 2024
32f409d
Add chunked writer
endast Sep 3, 2024
c470c84
Remove deeprvat from paths
endast Sep 3, 2024
739717e
Fix dtype
endast Sep 4, 2024
c77d793
Add merge to snakemake pipeline
endast Sep 4, 2024
c891b41
Remove x.zarr and y.zarr
endast Sep 4, 2024
06e34b6
Working test for 5 chunks
endast Sep 4, 2024
3ad8434
Add back sample_ids test
endast Sep 4, 2024
45f6a32
Fix pretrained path
endast Sep 4, 2024
dd62d49
Remove phenotypes key
endast Sep 4, 2024
7873bac
Merge branch 'feature/chunked-writer-precomputed-burdens-testing-comb…
endast Sep 4, 2024
db92c10
Fix failing tests
endast Sep 4, 2024
ff82d44
More config errors
endast Sep 4, 2024
0b6aeda
fixes to pipelines
bfclarke Sep 9, 2024
6c6bf2f
remove unfinished sections
bfclarke Sep 9, 2024
5181ed6
pipeline fixes
bfclarke Sep 10, 2024
68d2ccd
reduce number of phenotypes in example configs
bfclarke Sep 10, 2024
0c70898
bug fix
bfclarke Sep 10, 2024
5511a6d
pipeline fixes
bfclarke Sep 10, 2024
4dace76
add example config for REGENIE
bfclarke Sep 10, 2024
49979ab
pipeline fixes
bfclarke Sep 10, 2024
e41bb3a
pipeline fixes
bfclarke Sep 10, 2024
dc5df06
pipeline fixes
bfclarke Sep 10, 2024
ebd7baf
adapt CV pipeline
bfclarke Sep 13, 2024
52b3990
pipeline fix
bfclarke Sep 16, 2024
01929c2
reduce uninformative logging
bfclarke Sep 17, 2024
fdcdc45
modifications for running on compute cluster
bfclarke Sep 17, 2024
eb996e5
pipeline fix
bfclarke Sep 17, 2024
bed51b6
fix typo
bfclarke Sep 17, 2024
6d3275e
delete unused files
bfclarke Sep 17, 2024
242c6b8
fixup! Format Python code with psf/black
Sep 17, 2024
3483fc6
pipeline fixes
bfclarke Sep 20, 2024
b86080a
pipeline fixes
bfclarke Sep 20, 2024
2662477
Merge branch 'feature/chunked-writer-precomputed-burdens-testing-comb…
bfclarke Sep 20, 2024
c09eec3
code cleanup
bfclarke Sep 24, 2024
0f3e2d8
remove unneeded test files
bfclarke Sep 24, 2024
c238e75
fixup! Format Python code with psf/black
Sep 24, 2024
a052bd6
correct typo
bfclarke Sep 24, 2024
7c2e867
black
bfclarke Sep 24, 2024
14f783f
Merge branch 'feature/chunked-writer-precomputed-burdens-testing-comb…
bfclarke Sep 24, 2024
1b3dfb1
fixup! Format Python code with psf/black
Sep 24, 2024
102 changes: 69 additions & 33 deletions deeprvat/cv_utils.py
@@ -96,80 +96,117 @@ def generate_test_config(input_config, out_file, fold, n_folds):


@cli.command()
@click.option("--link-burdens", type=click.Path())
@click.option("--skip-burdens", is_flag=True)
@click.option("--burden-dirs", "-b", multiple=True)
@click.argument("out_dir", type=click.Path(), default="./")
@click.option("--xy-dirs", "-b", multiple=True)
@click.argument("out_dir_burdens", type=click.Path(), default="./")
@click.argument("out_dir_xy", type=click.Path(), default="./")
@click.argument("config_file", type=click.Path(exists=True))
def combine_test_set_burdens(
out_dir,
link_burdens,
out_dir_burdens,
out_dir_xy,
skip_burdens,
burden_dirs,
xy_dirs,
config_file,
):
assert len(burden_dirs) == len(xy_dirs)

with open(config_file) as f:
config = yaml.safe_load(f)
compression_level = 1
skip_burdens = link_burdens is not None
n_total_samples = []
for burden_dir in burden_dirs:
print(burden_dir)
this_y = zarr.open(f"{burden_dir}/y.zarr")
this_x = zarr.open(f"{burden_dir}/x.zarr")
for xy_dir, burden_dir in zip(xy_dirs, burden_dirs):
logger.debug(xy_dir)
this_y = zarr.open(f"{xy_dir}/y.zarr")
this_x = zarr.open(f"{xy_dir}/x.zarr")
this_sample_ids_xy = zarr.load(f"{xy_dir}/sample_ids.zarr")
# this_burdens = zarr.open(f'{burden_dir}/burdens.zarr')

assert this_y.shape[0] == this_x.shape[0] # == this_burdens.shape[0]
assert this_y.shape[0] == this_x.shape[0]
n_total_samples.append(this_y.shape[0])

if not skip_burdens:
this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")
this_sample_ids_burdens = zarr.load(f"{burden_dir}/sample_ids.zarr")
assert this_y.shape[0] == this_burdens.shape[0]
logger.debug(this_sample_ids_xy, this_sample_ids_burdens)
assert np.array_equal(this_sample_ids_xy, this_sample_ids_burdens)

n_total_samples = np.sum(n_total_samples)
print(f"Total number of samples {n_total_samples}")
logger.info(f"Total number of samples: {n_total_samples}")
if not skip_burdens:
this_burdens = zarr.open(
f"{burden_dir}/burdens.zarr"
) # any burden tensor (here from the last file to get dims 1 -n)
burdens = zarr.open(
Path(out_dir) / "burdens.zarr",
Path(out_dir_burdens) / "burdens.zarr",
mode="a",
shape=(n_total_samples,) + this_burdens.shape[1:],
chunks=(1000, 1000),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
print(f"burdens shape: {burdens.shape}")
else:
burdens = None
logger.info(f"burdens shape: {burdens.shape}")
sample_ids_burdens = zarr.open(
Path(out_dir_burdens) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype="U200",
compressor=Blosc(clevel=compression_level),
)

y = zarr.open(
Path(out_dir) / "y.zarr",
Path(out_dir_xy) / "y.zarr",
mode="a",
shape=(n_total_samples,) + this_y.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
x = zarr.open(
Path(out_dir) / "x.zarr",
Path(out_dir_xy) / "x.zarr",
mode="a",
shape=(n_total_samples,) + this_x.shape[1:],
chunks=(None, None),
dtype=np.float32,
compressor=Blosc(clevel=compression_level),
)
sample_ids_xy = zarr.open(
Path(out_dir_xy) / "sample_ids.zarr",
mode="a",
shape=(n_total_samples),
chunks=(None),
dtype="U200",
compressor=Blosc(clevel=compression_level),
)

start_idx = 0

for burden_dir in burden_dirs:
this_y = zarr.open(f"{burden_dir}/y.zarr")[:]
for xy_dir, burden_dir in zip(xy_dirs, burden_dirs):
this_y = zarr.load(f"{xy_dir}/y.zarr")
end_idx = start_idx + this_y.shape[0]
this_x = zarr.open(f"{burden_dir}/x.zarr")[:]
if not skip_burdens:
logger.info("writing burdens")
this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")[:]
burdens[start_idx:end_idx] = this_burdens
print((start_idx, end_idx))
this_x = zarr.load(f"{xy_dir}/x.zarr")
this_sample_ids_xy = zarr.load(f"{xy_dir}/sample_ids.zarr")
y[start_idx:end_idx] = this_y
x[start_idx:end_idx] = this_x
sample_ids_xy[start_idx:end_idx] = this_sample_ids_xy
if not skip_burdens:
logger.info("writing burdens")
this_burdens = zarr.load(f"{burden_dir}/burdens.zarr")
burdens[start_idx:end_idx] = this_burdens
this_sample_ids_burdens = zarr.load(f"{burden_dir}/sample_ids.zarr")
sample_ids_burdens[start_idx:end_idx] = this_sample_ids_burdens
start_idx = end_idx

# sanity check
if not skip_burdens and not np.array_equal(sample_ids_xy[:], sample_ids_burdens[:]):
logger.error(
"sample_ids_xy, sample_ids_burdens do not match:\n"
+ f"sample_ids_xy: {sample_ids_xy[:]}"
+ f"sample_ids_burdens: {sample_ids_burdens[:]}"
)
raise RuntimeError("sample_ids_xy, sample_ids_burdens do not match")

y_transformation = config["association_testing_data"]["dataset_config"].get(
"y_transformation", None
)
@@ -202,13 +239,12 @@ def combine_test_set_burdens(
for col in range(this_y.shape[1]):
this_y[:, col] = my_quantile_transform(this_y[:, col])
y[:] = this_y

if not skip_burdens:
genes = np.load(f"{burden_dirs[0]}/genes.npy")
np.save(Path(out_dir_burdens) / "genes.npy", genes)

print("done")
if link_burdens is not None:
source_path = Path(out_dir) / "burdens.zarr"
source_path.unlink(missing_ok=True)
source_path.symlink_to(link_burdens)
genes = np.load(f"{burden_dirs[0]}/genes.npy")
np.save(Path(out_dir) / "genes.npy", genes)


if __name__ == "__main__":
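The rewritten `combine_test_set_burdens` preallocates one zarr array sized for the total sample count, copies each per-chunk array into its slice, and finally cross-checks `sample_ids_xy` against `sample_ids_burdens`. A minimal standalone sketch of that concatenation pattern follows; the chunk directories, output path, and two-chunk setup are hypothetical, not the PR's CLI.

```python
from pathlib import Path

import numpy as np
import zarr
from numcodecs import Blosc

# Hypothetical per-chunk directories, each holding a y.zarr written by one job
# (the real command receives these via --xy-dirs / --burden-dirs).
chunk_dirs = [Path("chunk_0"), Path("chunk_1")]

n_per_chunk = [zarr.open(str(d / "y.zarr")).shape[0] for d in chunk_dirs]
n_total = int(np.sum(n_per_chunk))
trailing_dims = zarr.open(str(chunk_dirs[0] / "y.zarr")).shape[1:]

# Preallocate the combined array once, sized for all samples.
combined = zarr.open(
    "combined/y.zarr",
    mode="a",
    shape=(n_total,) + trailing_dims,
    chunks=(None, None),
    dtype=np.float32,
    compressor=Blosc(clevel=1),
)

# Copy each chunk into its slice of the preallocated array.
start_idx = 0
for d in chunk_dirs:
    this = zarr.load(str(d / "y.zarr"))  # reads the whole chunk into memory
    combined[start_idx : start_idx + this.shape[0]] = this
    start_idx += this.shape[0]
```

Note the signature change: the single `out_dir` and `--link-burdens` are gone; burdens and x/y arrays now land in separate `out_dir_burdens` and `out_dir_xy` directories, with sample IDs written alongside each so the final `np.array_equal` sanity check can catch misaligned chunks.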
102 changes: 53 additions & 49 deletions deeprvat/data/dense_gt.py
@@ -65,7 +65,7 @@ def __init__(
split: str = "",
train_dataset: Optional[Dataset] = None,
chromosomes: List[str] = None,
phenotype_file: str = None,
phenotype_file: Optional[str] = None,
standardize_xpheno: bool = True,
standardize_anno: bool = False,
standardize_rare_anno: bool = False,
@@ -106,13 +106,14 @@ def __init__(
zarr_dir: Optional[str] = None,
cache_matrices: bool = False,
verbose: bool = False,
return_genotypes: bool = True,
):
if verbose:
logger.setLevel(logging.DEBUG)
else:
logger.setLevel(logging.INFO)

self.check_samples = True # TODO undo
self.check_samples = False # NOTE: Set to True for debugging
self.split = split
self.train_dataset = train_dataset
self.chromosomes = (
@@ -134,13 +135,10 @@
f"Using phenotypes: x: {self.x_phenotypes}, " f"y: {self.y_phenotypes}"
)

if gt_file is None:
raise ValueError("gt_file must be specified")
self.gt_filename = gt_file
if variant_file is None:
raise ValueError("variant_file must be specified")
if phenotype_file is None:
raise ValueError("phenotype_file must be specified")
self.gt_filename = gt_file
self.return_genotypes = return_genotypes
self.variant_filename = variant_file
self.variant_matrix = None
self.genotype_matrix = None
@@ -154,9 +152,6 @@
self.variant_matrix = f["variant_matrix"][:]
self.genotype_matrix = f["genotype_matrix"][:]

logger.info(
f"Using phenotype file {phenotype_file} and genotype file {self.gt_filename}"
)
self.setup_phenotypes(
phenotype_file, sim_phenotype_file, skip_y_na, skip_x_na, sample_file
)
@@ -204,45 +199,54 @@ def __init__(
else:
self.variants_to_keep = variants_to_keep

self.setup_annotations(
annotation_file, annotation_aggregation, precomputed_annotations
)
if self.return_genotypes:
self.setup_annotations(
annotation_file, annotation_aggregation, precomputed_annotations
)

self.transform_data()
self.setup_variants(min_common_variant_count, min_common_af, variants)

self.get_variant_metadata(grouping_level)
if self.return_genotypes:
self.setup_variants(min_common_variant_count, min_common_af, variants)

self.get_variant_metadata(grouping_level)

if rare_embedding is not None:
if rare_embedding is not None and self.return_genotypes:
self.rare_embedding = getattr(rare_embedders, rare_embedding["type"])(
self, **rare_embedding["config"]
)

else:
self.rare_embedding = None

def __getitem__(self, idx: int) -> torch.tensor:
if self.variant_matrix is None:
gt_file = h5py.File(self.gt_filename, "r")
self.variant_matrix = gt_file["variant_matrix"]
self.genotype_matrix = gt_file["genotype_matrix"]
if self.cache_matrices:
self.variant_matrix = self.variant_matrix[:]
self.genotype_matrix = self.genotype_matrix[:]

# idx_pheno = self.index_map_pheno[idx] #samples and phenotype is already subset so can use idx
idx_geno = self.index_map_geno[idx]
sparse_variants = self.variant_matrix[idx_geno, :]
sparse_genotype = self.genotype_matrix[idx_geno, :]
(
common_variants,
all_sparse_variants,
sparse_genotype,
) = self.get_common_variants(sparse_variants, sparse_genotype)

rare_variant_annotations = self.get_rare_variants(
idx, all_sparse_variants, sparse_genotype
)
if self.return_genotypes:
if self.variant_matrix is None or self.genotype_matrix is None:
gt_file = h5py.File(self.gt_filename, "r")
self.variant_matrix = gt_file["variant_matrix"]
self.genotype_matrix = gt_file["genotype_matrix"]
if self.cache_matrices:
self.variant_matrix = self.variant_matrix[:]
self.genotype_matrix = self.genotype_matrix[:]

idx_geno = self.index_map_geno[idx]
if self.check_samples:
# sanity check, can be removed in future
assert self.samples_gt[idx_geno] == self.samples[idx]

sparse_variants = self.variant_matrix[idx_geno, :]
sparse_genotype = self.genotype_matrix[idx_geno, :]
(
common_variants,
all_sparse_variants,
sparse_genotype,
) = self.get_common_variants(sparse_variants, sparse_genotype)

rare_variant_annotations = self.get_rare_variants(
idx, all_sparse_variants, sparse_genotype
)
else:
common_variants = torch.tensor([], dtype=torch.float)
rare_variant_annotations = torch.tensor([], dtype=torch.float)

phenotypes = self.phenotype_df.iloc[
idx, :
Expand All @@ -255,9 +259,7 @@ def __getitem__(self, idx: int) -> torch.tensor:
y = torch.tensor(
phenotypes[self.y_phenotypes].to_numpy(dtype=np.float32), dtype=torch.float
)
if self.check_samples:
# sanity check, can be removed in future
assert self.samples_gt[idx_geno] == self.samples[idx]

return {
"sample": self.samples[idx],
"x_phenotypes": x_phenotype_tensor,
@@ -287,11 +289,6 @@ def setup_phenotypes(
):
logger.debug("Reading phenotype dataframe")
self.phenotype_df = pd.read_parquet(phenotype_file, engine="pyarrow")
with h5py.File(self.gt_filename, "r") as f:
samples_gt = f["samples"][:]
samples_gt = np.array([item.decode("utf-8") for item in samples_gt])
if self.check_samples:
self.samples_gt = samples_gt
samples_phenotype_df = np.array(self.phenotype_df.index)
# phenotypes_df has first to be sorted in the same order as samples_gt
if sim_phenotype_file is not None:
@@ -315,14 +312,21 @@
logger.warning(
"Some samples from the sample file were not found in the data"
)
sample_to_keep = shared_samples
samples_to_keep = shared_samples
logger.info(
f"Number of samples in sample file and in phenotype_df: {len(samples_to_keep)}"
)
else:
logger.info("Using all samples in phenotype df")
samples_to_keep = copy.deepcopy(samples_phenotype_df)

# if self.return_genotypes:
with h5py.File(self.gt_filename, "r") as f:
samples_gt = f["samples"][:]
samples_gt = np.array([item.decode("utf-8") for item in samples_gt])
if self.check_samples:
self.samples_gt = samples_gt

logger.info("Removing samples that are not in genotype file")

samples_to_keep = np.array(
@@ -353,11 +357,11 @@ def setup_phenotypes(
mask_cols += self.x_phenotypes
mask = (self.phenotype_df[mask_cols].notna()).all(axis=1)
mask &= samples_to_keep_mask
samples_to_keep = self.phenotype_df.index[mask]
self.samples = self.phenotype_df.index[mask]
self.n_samples = mask.sum()
logger.info(f"Final number of kept samples: {self.n_samples}")

self.phenotype_df = self.phenotype_df[mask]
self.samples = self.phenotype_df.index.to_numpy()

# account for the fact that genotypes.h5 and phenotype_df can have different
# orders of their samples
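The `__getitem__` rewrite gates all genotype work behind `self.return_genotypes` and keeps the earlier lazy-open behavior for the HDF5 handles. A minimal sketch of that lazy-open/caching pattern, reduced from the diff; the class name and return value here are illustrative only, not the PR's API.

```python
import h5py

class LazyGenotypes:
    """Open the genotype HDF5 file on first access, optionally caching it."""

    def __init__(self, gt_filename: str, cache_matrices: bool = False):
        self.gt_filename = gt_filename
        self.cache_matrices = cache_matrices
        self.variant_matrix = None
        self.genotype_matrix = None

    def __getitem__(self, idx_geno: int):
        if self.variant_matrix is None or self.genotype_matrix is None:
            gt_file = h5py.File(self.gt_filename, "r")  # handle stays open
            self.variant_matrix = gt_file["variant_matrix"]
            self.genotype_matrix = gt_file["genotype_matrix"]
            if self.cache_matrices:
                # One-time full read into numpy; trades memory for per-item I/O.
                self.variant_matrix = self.variant_matrix[:]
                self.genotype_matrix = self.genotype_matrix[:]
        return self.variant_matrix[idx_geno, :], self.genotype_matrix[idx_geno, :]
```

Keeping the handle open after the first access avoids reopening the file on every item, which matters when a DataLoader worker calls `__getitem__` many times per epoch; `cache_matrices=True` instead pays the memory cost of a single full read up front.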