From 2c903f2b4a2c0d9cddd7ac1cab7b4d9ea1ef5340 Mon Sep 17 00:00:00 2001
From: tszfungc <tszfungc@usc.edu>
Date: Tue, 16 Aug 2022 18:59:16 -0700
Subject: [PATCH 1/4] Add mean_impute

---
 sgkit/stats/preprocessing.py | 90 +++++++++++++++++++++++++++++++++++-
 sgkit/variables.py           | 32 +++++++++++++
 2 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/sgkit/stats/preprocessing.py b/sgkit/stats/preprocessing.py
index 195cada68..d51e999e3 100644
--- a/sgkit/stats/preprocessing.py
+++ b/sgkit/stats/preprocessing.py
@@ -1,4 +1,4 @@
-from typing import Hashable, Optional
+from typing import Hashable, Optional, Sequence
 
 import dask.array as da
 import numpy as np
@@ -185,3 +185,91 @@ def filter_partial_calls(
     )
     new_ds[variables.call_genotype_complete].attrs["mixed_ploidy"] = mixed_ploidy
     return conditional_merge_datasets(ds, new_ds, merge)
+
+
+def mean_impute(
+    ds: Dataset,
+    variable: str,
+    dim: Hashable | Sequence[Hashable] = "samples",
+    merge: bool = True,
+) -> Dataset:
+    """Mean impute a masked variable
+
+    Parameters
+    ----------
+    ds
+        Dataset containing variable to be imputed.
+    variable
+        Input variable name
+        ``f"{variable}"`` and ``f"{variable}_masked"`` must be present in ``ds``.
+    dim:
+        Dimension(s) where the means are computed along.
+    merge
+        If True (the default), merge the input dataset and the computed
+        output variables into a single dataset, otherwise return only
+        the imputed output variables.
+        See :ref:`dataset_merge` for more details.
+
+    Returns
+    -------
+    Dataset containing :data:`sgkit.variables.{variable}_imputed` in which masked entries are
+    replaced with the mean values of the unmasked.
+
+    Examples
+    --------
+
+    >>> import sgkit as sg, numpy as np
+    >>> from sgkit.stats.preprocessing import mean_impute
+    >>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=10, seed=1, missing_pct=.1)
+    >>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
+    samples    S0   S1   S2   S3   S4   S5   S6   S7   S8   S9
+    variants
+    0         1/0  1/0  ./0  1/1  0/1  1/0  0/0  0/0  ./.  1/0
+    1         ./1  0/0  1/0  1/1  ./0  1/1  1/1  1/1  0/1  0/0
+    2         ./0  1/1  1/.  0/1  0/1  0/1  1/0  ./1  1/0  0/0
+    3         0/1  0/1  0/1  0/1  1/1  1/1  0/0  1/1  0/1  1/0
+
+    >>> ds["call_dosage"] = ds.call_genotype.sum(dim="ploidy").astype(float)
+    >>> ds["call_dosage_mask"] = ds.call_genotype_mask.any(dim='ploidy')
+    >>> ds["call_dosage"] = ds["call_dosage"].where(~ds["call_dosage_mask"], np.nan)
+    >>> ds["call_dosage"]  # doctest: +NORMALIZE_WHITESPACE
+    <xarray.DataArray 'call_dosage' (variants: 4, samples: 10)>
+    array([[ 1.,  1., nan,  2.,  1.,  1.,  0.,  0., nan,  1.],
+           [nan,  0.,  1.,  2., nan,  2.,  2.,  2.,  1.,  0.],
+           [nan,  2., nan,  1.,  1.,  1.,  1., nan,  1.,  0.],
+           [ 1.,  1.,  1.,  1.,  2.,  2.,  0.,  2.,  1.,  1.]])
+    Dimensions without coordinates: variants, samples
+
+    >>> ds = mean_impute(ds, variable='call_dosage', dim='samples')
+    >>> ds["call_dosage_imputed"] # doctest: +NORMALIZE_WHITESPACE
+    <xarray.DataArray 'call_dosage_imputed' (variants: 4, samples: 10)>
+    array([[1.   , 1.   , 0.875, 2.   , 1.   , 1.   , 0.   , 0.   , 0.875,
+            1.   ],
+           [1.25 , 0.   , 1.   , 2.   , 1.25 , 2.   , 2.   , 2.   , 1.   ,
+            0.   ],
+           [1.   , 2.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   , 1.   ,
+            0.   ],
+           [1.   , 1.   , 1.   , 1.   , 2.   , 2.   , 0.   , 2.   , 1.   ,
+            1.   ]])
+    Dimensions without coordinates: variants, samples
+    Attributes:
+        comment:  Dosages imputed, encoded as floats, with NaN indicating a missi...
+
+    """
+
+    variables.validate(ds, variable)
+    variables.validate(ds, f"{variable}_mask")
+
+    unmasked = ~ds[f"{variable}_mask"]
+    new_ds = create_dataset(
+        {
+            f"{variable}_imputed": (
+                ds[variable].dims,
+                ds[variable]
+                .where(unmasked, ds[variable].where(unmasked).mean(dim=dim))
+                .data,
+            )
+        }
+    )
+
+    return conditional_merge_datasets(ds, new_ds, merge)
diff --git a/sgkit/variables.py b/sgkit/variables.py
index c1dbb5e9a..70fe0d9bb 100644
--- a/sgkit/variables.py
+++ b/sgkit/variables.py
@@ -214,6 +214,15 @@ def _check_field(
     )
 )
 
+call_dosage_imputed, call_dosage_imputed_spec = SgkitVariables.register_variable(
+    ArrayLikeSpec(
+        "call_dosage_imputed",
+        kind="f",
+        ndim=2,
+        __doc__="""Dosages imputed, encoded as floats, with NaN indicating a missing value.""",
+    )
+)
+
 call_dosage_mask, call_dosage_mask_spec = SgkitVariables.register_variable(
     ArrayLikeSpec(
         "call_dosage_mask",
@@ -293,6 +302,16 @@ def _check_field(
     )
 )
 
+call_genotype_imputed, call_genotype_imputed_spec = SgkitVariables.register_variable(
+    ArrayLikeSpec(
+        "call_genotype_imputed",
+        kind="f",
+        ndim=3,
+        __doc__="""
+Call genotype imputed """,
+    )
+)
+
 (
     call_genotype_probability,
     call_genotype_probability_spec,
@@ -305,6 +324,19 @@ def _check_field(
     )
 )
 
+(
+    call_genotype_probability_imputed,
+    call_genotype_probability_imputed_spec,
+) = SgkitVariables.register_variable(
+    ArrayLikeSpec(
+        "call_genotype_probability_imputed",
+        kind="f",
+        ndim=3,
+        __doc__="""Genotype probabilities Imputed.""",
+    )
+)
+
+
 (
     call_genotype_probability_mask,
     call_genotype_probability_mask_spec,

From ed6eaa8446486f25fe6a4ae5ec6df0ab2c3261a3 Mon Sep 17 00:00:00 2001
From: Tsz-Fung Chan <54761316+tszfungc@users.noreply.github.com>
Date: Tue, 16 Aug 2022 21:04:17 -0700
Subject: [PATCH 2/4] Update docstrings

---
 sgkit/stats/preprocessing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sgkit/stats/preprocessing.py b/sgkit/stats/preprocessing.py
index d51e999e3..b5d2f4417 100644
--- a/sgkit/stats/preprocessing.py
+++ b/sgkit/stats/preprocessing.py
@@ -198,12 +198,12 @@ def mean_impute(
     Parameters
     ----------
     ds
-        Dataset containing variable to be imputed.
+        Dataset containing the variable to be imputed.
     variable
         Input variable name
         ``f"{variable}"`` and ``f"{variable}_masked"`` must be present in ``ds``.
     dim:
-        Dimension(s) where the means are computed along.
+        Dimension(s) along which the means are computed.
     merge
         If True (the default), merge the input dataset and the computed
         output variables into a single dataset, otherwise return only

From 1264861875999359a7409a1dec46764e84fb1e86 Mon Sep 17 00:00:00 2001
From: tszfungc <tszfungc@usc.edu>
Date: Wed, 17 Aug 2022 10:37:16 -0700
Subject: [PATCH 3/4] Replace | operator with Union in typing

---
 sgkit/stats/preprocessing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sgkit/stats/preprocessing.py b/sgkit/stats/preprocessing.py
index d51e999e3..3b446f514 100644
--- a/sgkit/stats/preprocessing.py
+++ b/sgkit/stats/preprocessing.py
@@ -1,4 +1,4 @@
-from typing import Hashable, Optional, Sequence
+from typing import Hashable, Optional, Sequence, Union
 
 import dask.array as da
 import numpy as np
@@ -190,7 +190,7 @@ def filter_partial_calls(
 def mean_impute(
     ds: Dataset,
     variable: str,
-    dim: Hashable | Sequence[Hashable] = "samples",
+    dim: Union[Hashable, Sequence[Hashable]] = "samples",
     merge: bool = True,
 ) -> Dataset:
     """Mean impute a masked variable

From 3c5453ebe2bc8a70978c5d6b53be88eba6af2899 Mon Sep 17 00:00:00 2001
From: tszfungc <tszfungc@usc.edu>
Date: Thu, 18 Aug 2022 11:54:53 -0700
Subject: [PATCH 4/4] Remove f-string inside mean_impute.__doc__

---
 sgkit/stats/preprocessing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sgkit/stats/preprocessing.py b/sgkit/stats/preprocessing.py
index 3b446f514..f93ba24df 100644
--- a/sgkit/stats/preprocessing.py
+++ b/sgkit/stats/preprocessing.py
@@ -200,8 +200,8 @@ def mean_impute(
     ds
         Dataset containing variable to be imputed.
     variable
-        Input variable name
-        ``f"{variable}"`` and ``f"{variable}_masked"`` must be present in ``ds``.
+        Input variable name.
+        Variables ``{variable}`` and ``{variable}_mask`` must be present in ``ds``.
     dim:
         Dimension(s) where the means are computed along.
     merge