Skip to content

Commit

Permalink
CODE: normalise_genetic_value
Browse files Browse the repository at this point in the history
Add new function to normalise genetic value
  • Loading branch information
daikitag authored and mergify[bot] committed Feb 28, 2024
1 parent 4d65425 commit 255539a
Show file tree
Hide file tree
Showing 6 changed files with 239 additions and 3 deletions.
5 changes: 5 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ This page provides a detailed explanation of all public tstrait objects and func
.. autosummary::
normalise_phenotypes
normalise_genetic_value
```

### Result data classes
Expand Down Expand Up @@ -114,6 +115,10 @@ This page provides a detailed explanation of all public tstrait objects and func
.. autofunction:: tstrait.normalise_phenotypes
```

```{eval-rst}
.. autofunction:: tstrait.normalise_genetic_value
```

### Result data classes

```{eval-rst}
Expand Down
46 changes: 45 additions & 1 deletion docs/genetic.md
Original file line number Diff line number Diff line change
Expand Up @@ -245,4 +245,48 @@ genetic_df["genetic_value"] = genetic_df["genetic_value"] - trait_df["effect_siz
genetic_df.head()
```

The new genetic value dataframe can be used in {py:func}`sim_env` to simulate phenotypes.
The new genetic value dataframe can be used in {py:func}`sim_env` to simulate phenotypes.

(normalise_genetic_value)=

# Normalise Genetic Value

The computed genetic values can be scaled by using the {py:func}`normalise_genetic_value` function. The function
will first normalise the genetic value by subtracting the mean of the input genetic value and divide it
by the standard devitation of the input genetic value.
Afterwards, it scales the normalised genetic value based on the mean and variance input.
The output of {py:func}`normalise_genetic_value` is a {py:class}`pandas.DataFrame` object with the scaled genetic values.
It is suggested to use this function on the genetic value dataframe that is obtained by
{py:func}`genetic_value`, and use the normalised genetic value dataframe to simulate phenotypes
by using {py:func}`sim_env`.

An example usage of this function is shown below:

```{code-cell}
mean = 0
var = 1
ts = msprime.sim_ancestry(
samples=10_000,
recombination_rate=1e-8,
sequence_length=100_000,
population_size=10_000,
random_seed=1000,
)
ts = msprime.sim_mutations(ts, rate=1e-7, random_seed=1001)
trait_df = tstrait.sim_trait(ts, num_causal=1000, model=model, random_seed=500)
genetic_df = tstrait.genetic_value(ts, trait_df)
normalised_df = tstrait.normalise_genetic_value(genetic_df, mean=mean, var=var)
normalised_df.head()
```

The distribution of the normalised genetic value is shown below, and wee that the mean and variance
of the normalised genetic values are 0 and 1.

```{code-cell}
import matplotlib.pyplot as plt
plt.hist(normalised_df["genetic_value"], bins=40)
plt.title("Normalised Genetic Value")
plt.show()
```
129 changes: 129 additions & 0 deletions tests/test_genetic_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,3 +376,132 @@ def test_tree_seq_multiple_trait(self):
)

pd.testing.assert_frame_equal(genetic_df, genetic_result, check_dtype=False)


class TestNormaliseGenetic:
def test_output(self, sample_ts):
mean = 2
var = 4
model = tstrait.trait_model(distribution="normal", mean=2, var=6)
trait_df = tstrait.sim_trait(
ts=sample_ts, num_causal=20, model=model, random_seed=500
)
genetic_df = tstrait.genetic_value(sample_ts, trait_df)

normalised_df = tstrait.normalise_genetic_value(genetic_df, mean=mean, var=var)
genetic_array = normalised_df["genetic_value"].values
np.testing.assert_almost_equal(np.mean(genetic_array), mean, decimal=2)
np.testing.assert_almost_equal(np.var(genetic_array, ddof=1), var, decimal=2)
pd.testing.assert_series_equal(
normalised_df["trait_id"], genetic_df["trait_id"]
)
pd.testing.assert_series_equal(
normalised_df["individual_id"], genetic_df["individual_id"]
)

num_ind = sample_ts.num_individuals
assert len(normalised_df) == num_ind
assert normalised_df.shape[1] == 3
assert list(normalised_df.columns) == [
"individual_id",
"trait_id",
"genetic_value",
]

def test_default(self, sample_ts):
mean = 0
var = 1
model = tstrait.trait_model(distribution="normal", mean=2, var=6)
trait_df = tstrait.sim_trait(
ts=sample_ts, num_causal=20, model=model, random_seed=1000
)
genetic_df = tstrait.genetic_value(sample_ts, trait_df)
normalised_df = tstrait.normalise_genetic_value(genetic_df)
genetic_array = normalised_df["genetic_value"].values
np.testing.assert_almost_equal(np.mean(genetic_array), mean, decimal=2)
np.testing.assert_almost_equal(np.var(genetic_array, ddof=1), var, decimal=2)
pd.testing.assert_series_equal(
normalised_df["trait_id"], genetic_df["trait_id"]
)
pd.testing.assert_series_equal(
normalised_df["individual_id"], genetic_df["individual_id"]
)

num_ind = sample_ts.num_individuals
assert len(normalised_df) == num_ind
assert normalised_df.shape[1] == 3
assert list(normalised_df.columns) == [
"individual_id",
"trait_id",
"genetic_value",
]

def test_column(self, sample_ts):
model = tstrait.trait_model(distribution="normal", mean=2, var=6)
trait_df = tstrait.sim_trait(
ts=sample_ts, num_causal=20, model=model, random_seed=1000
)
genetic_df = tstrait.genetic_value(sample_ts, trait_df)
with pytest.raises(
ValueError, match="columns must be included in genetic_df dataframe"
):
tstrait.normalise_genetic_value(genetic_df[["trait_id", "individual_id"]])

with pytest.raises(
ValueError, match="columns must be included in genetic_df dataframe"
):
tstrait.normalise_genetic_value(genetic_df[["trait_id", "genetic_value"]])

with pytest.raises(
ValueError, match="columns must be included in genetic_df dataframe"
):
tstrait.normalise_genetic_value(
genetic_df[["genetic_value", "individual_id"]]
)

@pytest.mark.parametrize("var", [0, -1])
def test_negative_var(self, sample_ts, var):
model = tstrait.trait_model(distribution="normal", mean=2, var=6)
trait_df = tstrait.sim_trait(
ts=sample_ts, num_causal=20, model=model, random_seed=1000
)
genetic_df = tstrait.genetic_value(sample_ts, trait_df)

with pytest.raises(ValueError, match="Variance must be greater than 0."):
tstrait.normalise_genetic_value(genetic_df, var=var)

@pytest.mark.parametrize("ddof", [0, 1])
def test_ddof(self, sample_ts, ddof):
model = tstrait.trait_model(distribution="normal", mean=2, var=6)
trait_df = tstrait.sim_trait(
ts=sample_ts, num_causal=20, model=model, random_seed=1000
)
genetic_df = tstrait.genetic_value(sample_ts, trait_df)
normalised_df = tstrait.normalise_genetic_value(
genetic_df, mean=0, var=1, ddof=ddof
)
normalised_genetic_array = normalised_df["genetic_value"].values

genetic_array = genetic_df["genetic_value"].values
genetic_array = (genetic_array - np.mean(genetic_array)) / np.std(
genetic_array, ddof=ddof
)

np.testing.assert_array_almost_equal(normalised_genetic_array, genetic_array)

def test_pleiotropy(self, sample_ts):
mean = 0
var = 1
model = tstrait.trait_model(
distribution="multi_normal", mean=np.zeros(2), cov=np.identity(2)
)
trait_df = tstrait.sim_trait(
ts=sample_ts, num_causal=20, model=model, random_seed=1000
)
genetic_df = tstrait.genetic_value(sample_ts, trait_df)
normalised_df = tstrait.normalise_genetic_value(genetic_df, mean=mean, var=var)
grouped = normalised_df.groupby(["trait_id"])[["genetic_value"]]
mean_array = grouped.mean().values.T[0]
var_array = grouped.var().values.T[0]
np.testing.assert_almost_equal(mean_array, np.zeros(2), decimal=2)
np.testing.assert_almost_equal(var_array, np.ones(2), decimal=2)
4 changes: 2 additions & 2 deletions tests/test_simulate_phenotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ def test_output(self, sample_ts):
normalised_df = tstrait.normalise_phenotypes(phenotype_df, mean=mean, var=var)
phenotype_array = normalised_df["phenotype"].values
np.testing.assert_almost_equal(np.mean(phenotype_array), mean, decimal=2)
np.testing.assert_almost_equal(np.var(phenotype_array), var, decimal=2)
np.testing.assert_almost_equal(np.var(phenotype_array, ddof=1), var, decimal=2)
pd.testing.assert_series_equal(
normalised_df["trait_id"], phenotype_df["trait_id"]
)
Expand Down Expand Up @@ -263,7 +263,7 @@ def test_default(self, sample_ts):
normalised_df = tstrait.normalise_phenotypes(phenotype_df)
phenotype_array = normalised_df["phenotype"].values
np.testing.assert_almost_equal(np.mean(phenotype_array), mean, decimal=2)
np.testing.assert_almost_equal(np.var(phenotype_array), var, decimal=2)
np.testing.assert_almost_equal(np.var(phenotype_array, ddof=1), var, decimal=2)
pd.testing.assert_series_equal(
normalised_df["trait_id"], phenotype_df["trait_id"]
)
Expand Down
2 changes: 2 additions & 0 deletions tstrait/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
) # noreorder
from .genetic_value import (
genetic_value,
normalise_genetic_value,
) # noreorder
from .simulate_environment import (
sim_env,
Expand All @@ -47,5 +48,6 @@
"TraitModelNormal",
"TraitModelT",
"genetic_value",
"normalise_genetic_value",
"sim_env",
]
56 changes: 56 additions & 0 deletions tstrait/genetic_value.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,59 @@ def genetic_value(ts, trait_df):
genetic_result = genetic._run()

return genetic_result


def normalise_genetic_value(genetic_df, mean=0, var=1, ddof=1):
"""Normalise genetic value dataframe.
Parameters
----------
genetic_df : pandas.DataFrame
Genetic value dataframe.
mean : float, default 0
Mean of the resulting genetic value.
var : float, default 1
Variance of the resulting genetic value.
ddof : int, default 1
Delta degrees of freedom. The divisor used in computing the variance
is N - ddof, where N represents the number of elements.
Returns
-------
pandas.DataFrame
Dataframe with normalised genetic value.
Raises
------
ValueError
If `var` <= 0.
Notes
-----
The following columns must be included in `genetic_df`:
* **trait_id**: Trait ID.
* **individual_id**: Individual ID inside the tree sequence input.
* **genetic_value**: Simulated genetic values.
The dataframe output has the following columns:
* **trait_id**: Trait ID.
* **individual_id**: Individual ID inside the tree sequence input.
* **genetic_value**: Normalised genetic values.
Examples
--------
See :ref:`normalise_genetic_value` section for worked examples.
"""
if var <= 0:
raise ValueError("Variance must be greater than 0.")
genetic_df = _check_dataframe(
genetic_df, ["individual_id", "trait_id", "genetic_value"], "genetic_df"
)
grouped = genetic_df.groupby("trait_id")[["genetic_value"]]
transformed_genetic = grouped.transform(lambda x: (x - x.mean()) / x.std(ddof=ddof))
transformed_genetic = transformed_genetic * np.sqrt(var) + mean
genetic_df.loc[:, "genetic_value"] = transformed_genetic

return genetic_df

0 comments on commit 255539a

Please sign in to comment.