Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add mean imputation function #892

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 89 additions & 1 deletion sgkit/stats/preprocessing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Hashable, Optional
from typing import Hashable, Optional, Sequence, Union

import dask.array as da
import numpy as np
Expand Down Expand Up @@ -185,3 +185,91 @@ def filter_partial_calls(
)
new_ds[variables.call_genotype_complete].attrs["mixed_ploidy"] = mixed_ploidy
return conditional_merge_datasets(ds, new_ds, merge)


def mean_impute(
ds: Dataset,
variable: str,
dim: Union[Hashable, Sequence[Hashable]] = "samples",
merge: bool = True,
) -> Dataset:
"""Mean impute a masked variable
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be helpful to give a more descriptive follow up sentence here, like say

This replaces missing data for the specified variable with the mean of the non-missing values.


Parameters
----------
ds
Dataset containing the variable to be imputed.
variable
Input variable name.
Variables ``{variable}`` and ``{variable}_masked`` must be present in ``ds``.
dim:
Dimension(s) along which the means are computed.
merge
If True (the default), merge the input dataset and the computed
output variables into a single dataset, otherwise return only
the imputed output variables.
See :ref:`dataset_merge` for more details.

Returns
-------
Dataset containing :data:`sgkit.variables.{variable}_imputed` in which masked entries are
replaced with the mean values of the unmasked.

Examples
--------

>>> import sgkit as sg, numpy as np
>>> from sgkit.stats.preprocessing import mean_impute
>>> ds = sg.simulate_genotype_call_dataset(n_variant=4, n_sample=10, seed=1, missing_pct=.1)
>>> sg.display_genotypes(ds) # doctest: +NORMALIZE_WHITESPACE
samples S0 S1 S2 S3 S4 S5 S6 S7 S8 S9
variants
0 1/0 1/0 ./0 1/1 0/1 1/0 0/0 0/0 ./. 1/0
1 ./1 0/0 1/0 1/1 ./0 1/1 1/1 1/1 0/1 0/0
2 ./0 1/1 1/. 0/1 0/1 0/1 1/0 ./1 1/0 0/0
3 0/1 0/1 0/1 0/1 1/1 1/1 0/0 1/1 0/1 1/0

>>> ds["call_dosage"] = ds.call_genotype.sum(dim="ploidy").astype(float)
>>> ds["call_dosage_mask"] = ds.call_genotype_mask.any(dim='ploidy')
>>> ds["call_dosage"] = ds["call_dosage"].where(~ds["call_dosage_mask"], np.nan)
>>> ds["call_dosage"] # doctest: +NORMALIZE_WHITESPACE
<xarray.DataArray 'call_dosage' (variants: 4, samples: 10)>
array([[ 1., 1., nan, 2., 1., 1., 0., 0., nan, 1.],
[nan, 0., 1., 2., nan, 2., 2., 2., 1., 0.],
[nan, 2., nan, 1., 1., 1., 1., nan, 1., 0.],
[ 1., 1., 1., 1., 2., 2., 0., 2., 1., 1.]])
Dimensions without coordinates: variants, samples

>>> ds = mean_impute(ds, variable='call_dosage', dim='samples')
>>> ds["call_dosage_imputed"] # doctest: +NORMALIZE_WHITESPACE
<xarray.DataArray 'call_dosage_imputed' (variants: 4, samples: 10)>
array([[1. , 1. , 0.875, 2. , 1. , 1. , 0. , 0. , 0.875,
1. ],
[1.25 , 0. , 1. , 2. , 1.25 , 2. , 2. , 2. , 1. ,
0. ],
[1. , 2. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,
0. ],
[1. , 1. , 1. , 1. , 2. , 2. , 0. , 2. , 1. ,
1. ]])
Dimensions without coordinates: variants, samples
Attributes:
comment: Dosages imputed, encoded as floats, with NaN indicating a missi...

"""

variables.validate(ds, variable)
variables.validate(ds, f"{variable}_mask")

unmasked = ~ds[f"{variable}_mask"]
new_ds = create_dataset(
{
f"{variable}_imputed": (
ds[variable].dims,
ds[variable]
.where(unmasked, ds[variable].where(unmasked).mean(dim=dim))
.data,
)
}
)

return conditional_merge_datasets(ds, new_ds, merge)
32 changes: 32 additions & 0 deletions sgkit/variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,15 @@ def _check_field(
)
)

call_dosage_imputed, call_dosage_imputed_spec = SgkitVariables.register_variable(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure we want to create a whole new bunch of variables here. Wouldn't it be simpler if we returned a copy of the original dataset in which all the missing data for the variable in question was replaced with the mean, and the mask was unset?

This would be more useful for downstream work, wouldn't it? We'd surely want to use the (say) imputed call_dosage in downstream analyses, and we wouldn't want to need to change variable names in order to do this.

ArrayLikeSpec(
"call_dosage_imputed",
kind="f",
ndim=2,
__doc__="""Dosages imputed, encoded as floats, with NaN indicating a missing value.""",
)
)

call_dosage_mask, call_dosage_mask_spec = SgkitVariables.register_variable(
ArrayLikeSpec(
"call_dosage_mask",
Expand Down Expand Up @@ -293,6 +302,16 @@ def _check_field(
)
)

call_genotype_imputed, call_genotype_imputed_spec = SgkitVariables.register_variable(
ArrayLikeSpec(
"call_genotype_imputed",
kind="f",
ndim=3,
__doc__="""
Call genotype imputed """,
)
)

(
call_genotype_probability,
call_genotype_probability_spec,
Expand All @@ -305,6 +324,19 @@ def _check_field(
)
)

(
call_genotype_probability_imputed,
call_genotype_probability_imputed_spec,
) = SgkitVariables.register_variable(
ArrayLikeSpec(
"call_genotype_probability_imputed",
kind="f",
ndim=3,
__doc__="""Genotype probabilities Imputed.""",
)
)


(
call_genotype_probability_mask,
call_genotype_probability_mask_spec,
Expand Down