From 2c903f2b4a2c0d9cddd7ac1cab7b4d9ea1ef5340 Mon Sep 17 00:00:00 2001 From: tszfungc Date: Tue, 16 Aug 2022 18:59:16 -0700 Subject: [PATCH 1/4] Add mean_impute --- sgkit/stats/preprocessing.py | 90 +++++++++++++++++++++++++++++++++++- sgkit/variables.py | 32 +++++++++++++ 2 files changed, 121 insertions(+), 1 deletion(-) diff --git a/sgkit/stats/preprocessing.py b/sgkit/stats/preprocessing.py index 195cada68..d51e999e3 100644 --- a/sgkit/stats/preprocessing.py +++ b/sgkit/stats/preprocessing.py @@ -1,4 +1,4 @@ -from typing import Hashable, Optional +from typing import Hashable, Optional, Sequence import dask.array as da import numpy as np @@ -185,3 +185,91 @@ def filter_partial_calls( ) new_ds[variables.call_genotype_complete].attrs["mixed_ploidy"] = mixed_ploidy return conditional_merge_datasets(ds, new_ds, merge) + + +def mean_impute( + ds: Dataset, + variable: str, + dim: Hashable | Sequence[Hashable] = "samples", + merge: bool = True, +) -> Dataset: + """Mean impute a masked variable + + Parameters + ---------- + ds + Dataset containing variable to be imputed. + variable + Input variable name + ``f"{variable}"`` and ``f"{variable}_masked"`` must be present in ``ds``. + dim: + Dimension(s) where the means are computed along. + merge + If True (the default), merge the input dataset and the computed + output variables into a single dataset, otherwise return only + the imputed output variables. + See :ref:`dataset_merge` for more details. + + Returns + ------- + Dataset containing :data:`sgkit.variables.{variable}_imputed` in which masked entries are + replaced with the mean values of the unmasked. + + Examples + -------- + + >>> import sgkit as sg, numpy as np + >>> from sgkit.stats.preprocessing import mean_impute + >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=10, seed=1, missing_pct=.1) + >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE + samples S0 S1 S2 S3 S4 S5 S6 S7 S8 S9 + variants + 0 1/0 1/0 ./0 1/1 0/1 1/0 0/0 0/0 ./. 
1/0 + 1 ./1 0/0 1/0 1/1 ./0 1/1 1/1 1/1 0/1 0/0 + 2 ./0 1/1 1/. 0/1 0/1 0/1 1/0 ./1 1/0 0/0 + 3 0/1 0/1 0/1 0/1 1/1 1/1 0/0 1/1 0/1 1/0 + + >>> ds["call_dosage"] = ds.call_genotype.sum(dim="ploidy").astype(float) + >>> ds["call_dosage_mask"] = ds.call_genotype_mask.any(dim='ploidy') + >>> ds["call_dosage"] = ds["call_dosage"].where(~ds["call_dosage_mask"], np.nan) + >>> ds["call_dosage"] # doctest: +NORMALIZE_WHITESPACE + + array([[ 1., 1., nan, 2., 1., 1., 0., 0., nan, 1.], + [nan, 0., 1., 2., nan, 2., 2., 2., 1., 0.], + [nan, 2., nan, 1., 1., 1., 1., nan, 1., 0.], + [ 1., 1., 1., 1., 2., 2., 0., 2., 1., 1.]]) + Dimensions without coordinates: variants, samples + + >>> ds = mean_impute(ds, variable='call_dosage', dim='samples') + >>> ds["call_dosage_imputed"] # doctest: +NORMALIZE_WHITESPACE + + array([[1. , 1. , 0.875, 2. , 1. , 1. , 0. , 0. , 0.875, + 1. ], + [1.25 , 0. , 1. , 2. , 1.25 , 2. , 2. , 2. , 1. , + 0. ], + [1. , 2. , 1. , 1. , 1. , 1. , 1. , 1. , 1. , + 0. ], + [1. , 1. , 1. , 1. , 2. , 2. , 0. , 2. , 1. , + 1. ]]) + Dimensions without coordinates: variants, samples + Attributes: + comment: Dosages imputed, encoded as floats, with NaN indicating a missi... 
+ + """ + + variables.validate(ds, variable) + variables.validate(ds, f"{variable}_mask") + + unmasked = ~ds[f"{variable}_mask"] + new_ds = create_dataset( + { + f"{variable}_imputed": ( + ds[variable].dims, + ds[variable] + .where(unmasked, ds[variable].where(unmasked).mean(dim=dim)) + .data, + ) + } + ) + + return conditional_merge_datasets(ds, new_ds, merge) diff --git a/sgkit/variables.py b/sgkit/variables.py index c1dbb5e9a..70fe0d9bb 100644 --- a/sgkit/variables.py +++ b/sgkit/variables.py @@ -214,6 +214,15 @@ def _check_field( ) ) +call_dosage_imputed, call_dosage_imputed_spec = SgkitVariables.register_variable( + ArrayLikeSpec( + "call_dosage_imputed", + kind="f", + ndim=2, + __doc__="""Dosages imputed, encoded as floats, with NaN indicating a missing value.""", + ) +) + call_dosage_mask, call_dosage_mask_spec = SgkitVariables.register_variable( ArrayLikeSpec( "call_dosage_mask", @@ -293,6 +302,16 @@ def _check_field( ) ) +call_genotype_imputed, call_genotype_imputed_spec = SgkitVariables.register_variable( + ArrayLikeSpec( + "call_genotype_imputed", + kind="f", + ndim=3, + __doc__=""" +Call genotype imputed """, + ) +) + ( call_genotype_probability, call_genotype_probability_spec, @@ -305,6 +324,19 @@ def _check_field( ) ) +( + call_genotype_probability_imputed, + call_genotype_probability_imputed_spec, +) = SgkitVariables.register_variable( + ArrayLikeSpec( + "call_genotype_probability_imputed", + kind="f", + ndim=3, + __doc__="""Genotype probabilities Imputed.""", + ) +) + + ( call_genotype_probability_mask, call_genotype_probability_mask_spec, From ed6eaa8446486f25fe6a4ae5ec6df0ab2c3261a3 Mon Sep 17 00:00:00 2001 From: Tsz-Fung Chan <54761316+tszfungc@users.noreply.github.com> Date: Tue, 16 Aug 2022 21:04:17 -0700 Subject: [PATCH 2/4] Update docstrings --- sgkit/stats/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sgkit/stats/preprocessing.py b/sgkit/stats/preprocessing.py index d51e999e3..b5d2f4417 100644 --- 
a/sgkit/stats/preprocessing.py +++ b/sgkit/stats/preprocessing.py @@ -198,12 +198,12 @@ def mean_impute( Parameters ---------- ds - Dataset containing variable to be imputed. + Dataset containing the variable to be imputed. variable Input variable name ``f"{variable}"`` and ``f"{variable}_masked"`` must be present in ``ds``. dim: - Dimension(s) where the means are computed along. + Dimension(s) along which the means are computed. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only From 1264861875999359a7409a1dec46764e84fb1e86 Mon Sep 17 00:00:00 2001 From: tszfungc Date: Wed, 17 Aug 2022 10:37:16 -0700 Subject: [PATCH 3/4] Replace | operator with Union in typing --- sgkit/stats/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sgkit/stats/preprocessing.py b/sgkit/stats/preprocessing.py index d51e999e3..3b446f514 100644 --- a/sgkit/stats/preprocessing.py +++ b/sgkit/stats/preprocessing.py @@ -1,4 +1,4 @@ -from typing import Hashable, Optional, Sequence +from typing import Hashable, Optional, Sequence, Union import dask.array as da import numpy as np @@ -190,7 +190,7 @@ def filter_partial_calls( def mean_impute( ds: Dataset, variable: str, - dim: Hashable | Sequence[Hashable] = "samples", + dim: Union[Hashable, Sequence[Hashable]] = "samples", merge: bool = True, ) -> Dataset: """Mean impute a masked variable From 3c5453ebe2bc8a70978c5d6b53be88eba6af2899 Mon Sep 17 00:00:00 2001 From: tszfungc Date: Thu, 18 Aug 2022 11:54:53 -0700 Subject: [PATCH 4/4] Remove f-string inside mean_impute.__doc__ --- sgkit/stats/preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sgkit/stats/preprocessing.py b/sgkit/stats/preprocessing.py index 3b446f514..f93ba24df 100644 --- a/sgkit/stats/preprocessing.py +++ b/sgkit/stats/preprocessing.py @@ -200,8 +200,8 @@ def mean_impute( ds Dataset containing variable to be imputed. 
variable - Input variable name - ``f"{variable}"`` and ``f"{variable}_masked"`` must be present in ``ds``. + Input variable name. + Variables ``{variable}`` and ``{variable}_mask`` must be present in ``ds``. dim: Dimension(s) where the means are computed along. merge