diff --git a/README.md b/README.md index 9018361..c16ae8d 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ **tstrait** is a quantitative trait simulator of [tskit](https://tskit.readthedocs.io/) tree sequences. -- Documentation: +- Documentation: https://tskit.dev/tstrait/docs/ - Source code: https://github.com/tskit-dev/tstrait - Tree sequence documentation: https://tskit.dev/learn/ diff --git a/tests/data.py b/tests/data.py index 2e8e73c..ba577e2 100644 --- a/tests/data.py +++ b/tests/data.py @@ -58,6 +58,10 @@ def diff_ind_tree(): """ ts = binary_tree() tables = ts.dump_tables() + tables.individuals.clear() + tables.individuals.add_row() + tables.individuals.add_row() + tables.individuals.add_row() individuals = tables.nodes.individual individuals[[0, 2]] = 1 individuals[[1, 3]] = 2 @@ -106,6 +110,27 @@ def non_binary_tree(): return ts +def triploid_tree(): + """Same mutation and tree structure as non_binary_tree, but individuals have + different nodes and are triploids. See the details of the tree sequence + in the doctring for `non_binary_tree`. + + Individual0: Node0, Node2 and Node4 + Individual1: Node1, Node3 and Node5 + """ + ts = non_binary_tree() + tables = ts.dump_tables() + tables.individuals.clear() + tables.individuals.add_row() + tables.individuals.add_row() + individuals = tables.nodes.individual + individuals[[0, 2, 4]] = 0 + individuals[[1, 3, 5]] = 1 + tables.nodes.individual = individuals + ts = tables.tree_sequence() + return ts + + def all_trees_ts(n): """ Generate a tree sequence that corresponds to the lexicographic listing diff --git a/tests/test_base.py b/tests/test_base.py index 0504a93..8775d32 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -1,6 +1,7 @@ import fractions import numpy as np +import pandas as pd import pytest from tstrait.base import ( @@ -8,6 +9,8 @@ _check_val, _check_instance, _check_numeric_array, + _check_dataframe, + _check_non_decreasing, ) # noreorder @@ -72,3 +75,36 @@ def test_true(self): def test_nonnumeric(self): with pytest.raises(TypeError, match="input must be numeric"): _check_numeric_array([1, "1"], "input") + + +class TestDataFrame: + def test_type(self): + with pytest.raises( + TypeError, + match="df must be a instance", + ): + _check_dataframe(1, {"one"}, "df") + + def test_dataframe(self): + df = pd.DataFrame({"first": [0, 1], "second": [3, 4]}) + + with pytest.raises( + ValueError, match="columns must be included in df dataframe" + ): + _check_dataframe(df, ["column"], "df") + + pd.testing.assert_frame_equal( + _check_dataframe(df, ["first"], "df"), df[["first"]] + ) + + pd.testing.assert_frame_equal( + _check_dataframe(df, ["first", "second"], "df"), df + ) + + +class TestNonDecreasing: + def test_non_decreasing(self): + with pytest.raises(ValueError, match="array must be non-decreasing"): + _check_non_decreasing([0, 1, 0], "array") + + _check_non_decreasing([0, 0, 1, 1, 4], "array") diff --git a/tests/test_genetic_value.py b/tests/test_genetic_value.py new file mode 100644 index 0000000..828ba16 --- /dev/null +++ b/tests/test_genetic_value.py @@ -0,0 +1,334 @@ +import msprime +import numpy as np +import pandas as pd +import pytest +import tstrait +from tstrait.base import _check_numeric_array +from .data import ( + binary_tree, + binary_tree_seq, + diff_ind_tree, + non_binary_tree, + triploid_tree, +) # noreorder + + +@pytest.fixture(scope="class") +def sample_ts(): + ts = msprime.sim_ancestry(10, sequence_length=100_000, random_seed=1) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1) + return ts + + +@pytest.fixture(scope="class") +def sample_df(): + df = pd.DataFrame( + { + "site_id": [0, 1], + "causal_state": ["A", "A"], + "effect_size": [0.1, 0.1], + "trait_id": [0, 0], + } + ) + + return df + + +@pytest.fixture(scope="class") +def sample_trait_model(): + return tstrait.trait_model(distribution="normal", mean=0, var=1) + + +class TestInput: + """This test will check that an informative error is raised when the input parameter + does not have an appropriate type or value. + """ + + def test_input_type(self, sample_ts, sample_df): + with pytest.raises( + TypeError, match="ts must be a instance" + ): + tstrait.genetic_value(ts=1, trait_df=sample_df) + with pytest.raises( + TypeError, + match="trait_df must be a instance", + ): + tstrait.genetic_value(ts=sample_ts, trait_df=1) + + def test_bad_input(self, sample_ts, sample_df): + with pytest.raises( + ValueError, match="columns must be included in trait_df dataframe" + ): + df = sample_df.drop(columns=["site_id"]) + tstrait.genetic_value(ts=sample_ts, trait_df=df) + + with pytest.raises(ValueError, match="site_id must be non-decreasing"): + df = sample_df.copy() + df["site_id"] = [2, 0] + tstrait.genetic_value(ts=sample_ts, trait_df=df) + + @pytest.mark.parametrize("trait_id", [[2, 3], [0, 2]]) + def test_trait_id(self, sample_ts, sample_df, trait_id): + with pytest.raises( + ValueError, match="trait_id must be consecutive and start from 0" + ): + df = sample_df.copy() + df["trait_id"] = trait_id + tstrait.genetic_value(ts=sample_ts, trait_df=df) + + +class TestOutputDim: + """Check that the genetic_value function gives the output with correct dimensions""" + + def check_dimensions(self, df, nrow): + assert len(df) == nrow + assert df.shape[1] == 3 + assert list(df.columns) == [ + "trait_id", + "individual_id", + "genetic_value", + ] + _check_numeric_array(df["individual_id"], "individual_id") + _check_numeric_array(df["genetic_value"], "genetic_value") + _check_numeric_array(df["trait_id"], "trait_id") + + def test_output_simple(self, sample_ts, sample_df): + genetic_df = tstrait.genetic_value(ts=sample_ts, trait_df=sample_df) + self.check_dimensions(genetic_df, sample_ts.num_individuals) + + @pytest.mark.parametrize( + "trait_model", + [ + tstrait.trait_model(distribution="normal", mean=0, var=1), + tstrait.trait_model(distribution="exponential", scale=1), + tstrait.trait_model(distribution="fixed", value=1), + tstrait.trait_model(distribution="t", mean=0, var=1, df=1), + tstrait.trait_model(distribution="gamma", shape=1, scale=1), + ], + ) + def test_output_sim_trait(self, sample_ts, trait_model): + num_causal = 10 + trait_df = tstrait.sim_trait( + ts=sample_ts, num_causal=num_causal, model=trait_model + ) + genetic_df = tstrait.genetic_value(ts=sample_ts, trait_df=trait_df) + self.check_dimensions(genetic_df, sample_ts.num_individuals) + np.testing.assert_equal( + genetic_df["trait_id"], np.zeros(sample_ts.num_individuals) + ) + np.testing.assert_equal( + genetic_df["individual_id"], np.arange(sample_ts.num_individuals) + ) + + def test_output_multivariate(self, sample_ts): + num_trait = 3 + mean = np.ones(num_trait) + cov = np.eye(num_trait) + num_causal = 10 + model = tstrait.trait_model(distribution="multi_normal", mean=mean, cov=cov) + trait_df = tstrait.sim_trait(ts=sample_ts, num_causal=num_causal, model=model) + genetic_df = tstrait.genetic_value(ts=sample_ts, trait_df=trait_df) + self.check_dimensions(genetic_df, sample_ts.num_individuals * num_trait) + ind_id = np.arange(sample_ts.num_individuals) + trait_id = np.ones(sample_ts.num_individuals) + np.testing.assert_equal( + genetic_df["individual_id"], np.concatenate((ind_id, ind_id, ind_id)) + ) + np.testing.assert_equal( + genetic_df["trait_id"], + np.concatenate((trait_id * 0, trait_id * 1, trait_id * 2)), + ) + + def test_additional_row(self, sample_ts, sample_df): + """Check that adding unexpected rows to trait_df won't cause any errors""" + df = sample_df.copy() + df["height"] = [170, 180] + tstrait.genetic_value(ts=sample_ts, trait_df=df) + + +class TestGenotype: + """Test the `_individual_genotype` method and check if it can accurately detect + the individual genotype. Afterwards, we will check the output of `sim_trait`. + """ + + def test_binary_tree(self): + trait_df = pd.DataFrame( + { + "site_id": [0, 1, 2, 3], + "causal_state": ["T", "T", "C", "C"], + "effect_size": [1, 10, 100, 1000], + "trait_id": [0, 0, 0, 0], + } + ) + + ts = binary_tree() + tree = ts.first() + genetic = tstrait.GeneticValue(ts, trait_df) + g0 = genetic._individual_genotype(tree, ts.site(0), "T", ts.num_nodes) + g1 = genetic._individual_genotype(tree, ts.site(1), "T", ts.num_nodes) + g2 = genetic._individual_genotype(tree, ts.site(2), "C", ts.num_nodes) + g3 = genetic._individual_genotype(tree, ts.site(3), "C", ts.num_nodes) + + np.testing.assert_equal(g0, np.array([1, 0, 2])) + np.testing.assert_equal(g1, np.array([1, 1, 0])) + np.testing.assert_equal(g2, np.array([0, 1, 0])) + np.testing.assert_equal(g3, np.array([1, 2, 0])) + + genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + + result_df = pd.DataFrame( + { + "trait_id": [0, 0, 0], + "individual_id": [0, 1, 2], + "genetic_value": [1011, 2110, 2], + } + ) + + pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + + def test_diff_ind_tree(self): + trait_df = pd.DataFrame( + { + "site_id": [0, 1, 2, 3], + "causal_state": ["T", "T", "C", "C"], + "effect_size": [1, 10, 100, 1000], + "trait_id": [0, 0, 0, 0], + } + ) + + ts = diff_ind_tree() + tree = ts.first() + genetic = tstrait.GeneticValue(ts, trait_df) + g0 = genetic._individual_genotype(tree, ts.site(0), "T", ts.num_nodes) + g1 = genetic._individual_genotype(tree, ts.site(1), "T", ts.num_nodes) + g2 = genetic._individual_genotype(tree, ts.site(2), "C", ts.num_nodes) + g3 = genetic._individual_genotype(tree, ts.site(3), "C", ts.num_nodes) + + np.testing.assert_equal(g0, np.array([1, 1, 1])) + np.testing.assert_equal(g1, np.array([1, 0, 1])) + np.testing.assert_equal(g2, np.array([0, 1, 0])) + np.testing.assert_equal(g3, np.array([1, 1, 1])) + + genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + + result_df = pd.DataFrame( + { + "trait_id": [0, 0, 0], + "individual_id": [0, 1, 2], + "genetic_value": [1011, 1101, 1011], + } + ) + + pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + + def test_non_binary_tree(self): + trait_df = pd.DataFrame( + { + "site_id": [0, 1], + "causal_state": ["T", "C"], + "effect_size": [1, 10], + "trait_id": [0, 0], + } + ) + + ts = non_binary_tree() + tree = ts.first() + genetic = tstrait.GeneticValue(ts, trait_df) + g0 = genetic._individual_genotype(tree, ts.site(0), "T", ts.num_nodes) + g1 = genetic._individual_genotype(tree, ts.site(1), "C", ts.num_nodes) + + np.testing.assert_equal(g0, np.array([0, 1, 2])) + np.testing.assert_equal(g1, np.array([0, 1, 1])) + + genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + + result_df = pd.DataFrame( + { + "trait_id": [0, 0, 0], + "individual_id": [0, 1, 2], + "genetic_value": [0, 11, 12], + } + ) + + pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + + def test_triploid(self, sample_df): + trait_df = pd.DataFrame( + { + "trait_id": [0, 0], + "site_id": [0, 1], + "causal_state": ["T", "C"], + "effect_size": [1, 10], + } + ) + + ts = triploid_tree() + tree = ts.first() + genetic = tstrait.GeneticValue(ts, sample_df) + g0 = genetic._individual_genotype(tree, ts.site(0), "T", ts.num_nodes) + g1 = genetic._individual_genotype(tree, ts.site(1), "C", ts.num_nodes) + + np.testing.assert_equal(g0, np.array([1, 2])) + np.testing.assert_equal(g1, np.array([1, 1])) + + genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + + result_df = pd.DataFrame( + { + "trait_id": [0, 0], + "individual_id": [0, 1], + "genetic_value": [11, 12], + } + ) + + pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + + +class TestTreeSeq: + """Test the `genetic_value` function by using a tree sequence data.""" + + def test_tree_seq(self): + ts = binary_tree_seq() + trait_df = pd.DataFrame( + { + "site_id": [0, 1, 2], + "causal_state": ["T", "G", "C"], + "effect_size": [1, 10, 100], + "trait_id": [0, 0, 0], + } + ) + + genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + + result_df = pd.DataFrame( + { + "trait_id": [0, 0], + "individual_id": [0, 1], + "genetic_value": [101, 122], + } + ) + + pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + + def test_tree_seq_multiple_trait(self): + ts = binary_tree_seq() + trait_df = pd.DataFrame( + { + "trait_id": np.tile([0, 1], 3), + "site_id": np.repeat([0, 1, 2], 2), + "causal_state": np.repeat(["T", "G", "C"], 2), + "effect_size": [1, 2, 10, 20, 100, 200], + } + ) + + genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + + result_df = pd.DataFrame( + { + "trait_id": [0, 0, 1, 1], + "individual_id": [0, 1, 0, 1], + "genetic_value": [101, 122, 202, 244], + } + ) + + pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) diff --git a/tests/test_sim_trait.py b/tests/test_sim_trait.py index 47be5ba..d83b5ca 100644 --- a/tests/test_sim_trait.py +++ b/tests/test_sim_trait.py @@ -8,9 +8,10 @@ from tstrait.base import _check_numeric_array from .data import ( binary_tree, + binary_tree_seq, diff_ind_tree, non_binary_tree, - binary_tree_seq, + triploid_tree, ) # noreorder @@ -152,14 +153,18 @@ def test_binary_tree(self, tree_seq, sample_trait_model): assert c2 == {"C": 1} assert c3 == {"C": 2, "T": 2} - def non_binary_tree(self, sample_trait_model): - ts = non_binary_tree() + @pytest.mark.parametrize("tree_seq", [non_binary_tree(), triploid_tree()]) + def non_binary_tree(self, tree_seq, sample_trait_model): simulator = tstrait.TraitSimulator( - ts=ts, num_causal=4, model=sample_trait_model, alpha=0.3, random_seed=1 + ts=tree_seq, + num_causal=4, + model=sample_trait_model, + alpha=0.3, + random_seed=1, ) - tree = ts.first() - c0 = simulator._obtain_allele_count(tree, ts.site(0)) - c1 = simulator._obtain_allele_count(tree, ts.site(1)) + tree = tree_seq.first() + c0 = simulator._obtain_allele_count(tree, tree_seq.site(0)) + c1 = simulator._obtain_allele_count(tree, tree_seq.site(1)) assert c0 == {"T": 3} assert c1 == {"C": 2, "T": 1} diff --git a/tstrait/__init__.py b/tstrait/__init__.py index aae1c01..35fbaa4 100644 --- a/tstrait/__init__.py +++ b/tstrait/__init__.py @@ -28,6 +28,10 @@ TraitModelNormal, TraitModelT, ) # noreorder +from .genetic_value import ( + genetic_value, + GeneticValue, +) # noreorder __all__ = [ "__version__", @@ -46,4 +50,6 @@ "TraitModelMultivariateNormal", "TraitModelNormal", "TraitModelT", + "genetic_value", + "GeneticValue", ] diff --git a/tstrait/base.py b/tstrait/base.py index ba769f3..2a79510 100644 --- a/tstrait/base.py +++ b/tstrait/base.py @@ -2,6 +2,7 @@ import operator import numpy as np +import pandas as pd def _check_instance(data, name, input_type): @@ -40,3 +41,22 @@ def _check_numeric_array(array, name): """Check if the input is a numeric array or not. This is used in unit tests.""" result = np.array([_check_val(value, name) for value in array]) return result + + +def _check_dataframe(df, column, df_name): + """Check if `df` is a pandas dataframe or not and examine if `column` is a column + in `df` or not + column is the set of column names and df_name is the name of df + """ + + df = _check_instance(df, df_name, pd.DataFrame) + if not set(column).issubset(df.columns): + raise ValueError(f"{column} columns must be included in {df_name} dataframe") + return df[list(column)] + + +def _check_non_decreasing(array, array_name): + """Check that the array is non-decreasing.""" + diff = np.diff(array) + if np.min(diff) < 0: + raise ValueError(f"{array_name} must be non-decreasing") diff --git a/tstrait/genetic_value.py b/tstrait/genetic_value.py new file mode 100644 index 0000000..2e7e963 --- /dev/null +++ b/tstrait/genetic_value.py @@ -0,0 +1,156 @@ +import numba +import numpy as np +import pandas as pd +import tskit + +from .base import _check_instance, _check_dataframe, _check_non_decreasing # noreorder + + +@numba.njit +def _traversal_genotype( + nodes_individual, + left_child_array, + right_sib_array, + stack, + has_mutation, + num_individuals, + num_nodes, +): # pragma: no cover + """ + Numba to speed up the tree traversal algorithm to determine the genotype of + individuals. + Stack has to be Typed List in numba to use numba. + """ + + genotype = np.zeros(num_individuals) + while len(stack) > 0: + parent_node_id = stack.pop() + if parent_node_id == num_nodes: + individual_id = -1 + else: + individual_id = nodes_individual[parent_node_id] + if individual_id > -1: + genotype[individual_id] += 1 + child_node_id = left_child_array[parent_node_id] + while child_node_id != -1: + if not has_mutation[child_node_id]: + stack.append(child_node_id) + child_node_id = right_sib_array[child_node_id] + + return genotype + + +class GeneticValue: + """GeneticValue class to compute genetic values of individuals. + + :param ts: Tree sequence data with mutation. + :type ts: tskit.TreeSequence + :param trait_df: Pandas dataframe that includes causal site ID, causal allele, + simulated effect size, and trait ID. + :type effect_size_df: pandas.DataFrame + """ + + def __init__(self, ts, trait_df): + self.trait_df = trait_df[["site_id", "causal_state", "effect_size", "trait_id"]] + self.ts = ts + + def _individual_genotype(self, tree, site, causal_state, num_nodes): + """ + Returns a numpy array that describes the number of causal mutation in an + individual. + """ + has_mutation = np.zeros(num_nodes + 1, dtype=bool) + state_transitions = {tree.virtual_root: site.ancestral_state} + for m in site.mutations: + state_transitions[m.node] = m.derived_state + has_mutation[m.node] = True + stack = numba.typed.List() + for node, state in state_transitions.items(): + if state == causal_state: + stack.append(node) + + if len(stack) == 0: + genotype = np.zeros(self.ts.num_individuals) + else: + genotype = _traversal_genotype( + nodes_individual=self.ts.nodes_individual, + left_child_array=tree.left_child_array, + right_sib_array=tree.right_sib_array, + stack=stack, + has_mutation=has_mutation, + num_individuals=self.ts.num_individuals, + num_nodes=num_nodes, + ) + + return genotype + + def compute_genetic_value(self): + """Computes genetic values of individuals. + + :return: Returns a pandas dataframe with genetic value, individual ID, + and trait ID + :rtype: pandas.DataFrame + """ + + num_ind = self.ts.num_individuals + num_trait = np.max(self.trait_df.trait_id) + 1 + genetic_val_array = np.zeros((num_trait, num_ind)) + + num_nodes = self.ts.num_nodes + tree = tskit.Tree(self.ts) + + for data in self.trait_df.itertuples(): + site = self.ts.site(data.site_id) + tree.seek(site.position) + individual_genotype = self._individual_genotype( + tree=tree, + site=site, + causal_state=data.causal_state, + num_nodes=num_nodes, + ) + genetic_val_array[data.trait_id, :] += ( + individual_genotype * data.effect_size + ) + + df = pd.DataFrame( + { + "trait_id": np.repeat(np.arange(num_trait), num_ind), + "individual_id": np.tile(np.arange(num_ind), num_trait), + "genetic_value": genetic_val_array.flatten(), + } + ) + + return df + + +def genetic_value(ts, trait_df): + """Calculates genetic values of individuals based on the inputted tree sequence + and the dataframe with simulated effect sizes of causal mutations. + :param ts: The tree sequence data that will be used in the quantitative trait + simulation. The tree sequence data must include a mutation. + :type ts: tskit.TreeSequence + :param trait_df: The dataframe that includes simulated effect sizes of causal + mutations. It must include site_id, causal allele, simulated effect size, + and trait_id, and the data must be aligned based on site_id. + :type trait_df: pandas.DataFrame + :return: Returns a pandas dataframe that includes individual ID and genetic + value. + :rtype: pandas.DataFrame + """ + + ts = _check_instance(ts, "ts", tskit.TreeSequence) + trait_df = _check_dataframe( + trait_df, ["site_id", "causal_state", "effect_size", "trait_id"], "trait_df" + ) + _check_non_decreasing(trait_df["site_id"], "site_id") + + trait_id = trait_df["trait_id"].unique() + + if np.min(trait_id) != 0 or np.max(trait_id) != len(trait_id) - 1: + raise ValueError("trait_id must be consecutive and start from 0") + + genetic = GeneticValue(ts=ts, trait_df=trait_df) + + genetic_df = genetic.compute_genetic_value() + + return genetic_df diff --git a/tstrait/provenance.py b/tstrait/provenance.py index 98082b5..d3f8f11 100644 --- a/tstrait/provenance.py +++ b/tstrait/provenance.py @@ -2,6 +2,6 @@ try: from . import _version - __version__ = _version.version + __version__ = _version.version # pragma: no cover except ImportError: # pragma: nocover pass diff --git a/tstrait/simulate_effect_size.py b/tstrait/simulate_effect_size.py index e5beb5c..21c18af 100644 --- a/tstrait/simulate_effect_size.py +++ b/tstrait/simulate_effect_size.py @@ -87,10 +87,13 @@ def sim_causal_mutation(self): """ causal_site_array = self._choose_causal_site() num_samples = self.ts.num_samples + num_trait = self.model.num_trait tree = tskit.Tree(self.ts) causal_state_array = np.zeros(self.num_causal, dtype=object) - beta_array = np.zeros((self.num_causal, self.model.num_trait)) + beta_array = np.zeros((self.num_causal, num_trait), dtype=float) + trait_id_array = np.tile(np.arange(num_trait), self.num_causal) + site_id_array = np.repeat(causal_site_array, num_trait) for i, single_id in enumerate(causal_site_array): site = self.ts.site(single_id) @@ -101,26 +104,19 @@ def sim_causal_mutation(self): allele_freq = counts[causal_state] / num_samples beta = self.model.sim_effect_size(num_causal=self.num_causal, rng=self.rng) beta *= self._frequency_dependence(allele_freq) - beta_array[i] = beta + beta_array[i, :] = beta + + causal_state_array = np.repeat(causal_state_array, num_trait) df = pd.DataFrame( - columns=["site_id", "causal_state", "effect_size", "trait_id"] + { + "site_id": site_id_array, + "causal_state": causal_state_array, + "effect_size": beta_array.flatten(), + "trait_id": trait_id_array, + } ) - for i in range(self.model.num_trait): - df_add = pd.DataFrame( - { - "site_id": causal_site_array, - "causal_state": causal_state_array, - "effect_size": beta_array[:, i], - "trait_id": np.ones(self.num_causal) * i, - } - ) - df = pd.concat([df, df_add]) - - df = df.reset_index() - del df["index"] - return df