From 6f0d2650a39dda0c992d657a2e8dde7310e2a4ce Mon Sep 17 00:00:00 2001 From: daikitag <48062118+daikitag@users.noreply.github.com> Date: Sat, 15 Jul 2023 00:04:03 +0900 Subject: [PATCH] dataftame-effect-size Generate a pandas dataframe to output simulated effect sizes --- requirements/CI-complete/requirements.txt | 1 + requirements/CI-docs/requirements.txt | 1 + requirements/CI-tests-pip/requirements.txt | 1 + requirements/development.txt | 1 + setup.cfg | 1 + stats_tests.py | 561 +++++++++------- tests/test_sim_trait.py | 746 +++++++++++++++++++++ tstrait/__init__.py | 4 + tstrait/simulate_effect_size.py | 171 +++++ 9 files changed, 1227 insertions(+), 260 deletions(-) create mode 100644 tests/test_sim_trait.py create mode 100644 tstrait/simulate_effect_size.py diff --git a/requirements/CI-complete/requirements.txt b/requirements/CI-complete/requirements.txt index f2a4410..14bf36b 100644 --- a/requirements/CI-complete/requirements.txt +++ b/requirements/CI-complete/requirements.txt @@ -1,6 +1,7 @@ msprime numba numpy +pandas pytest==6.2.5 pytest-cov==3.0.0 tskit==0.5.3 \ No newline at end of file diff --git a/requirements/CI-docs/requirements.txt b/requirements/CI-docs/requirements.txt index 4f83450..4553860 100644 --- a/requirements/CI-docs/requirements.txt +++ b/requirements/CI-docs/requirements.txt @@ -5,6 +5,7 @@ matplotlib msprime numba numpy +pandas scipy sphinx==4.5 sphinx-argparse diff --git a/requirements/CI-tests-pip/requirements.txt b/requirements/CI-tests-pip/requirements.txt index 7ec4f48..13465f6 100644 --- a/requirements/CI-tests-pip/requirements.txt +++ b/requirements/CI-tests-pip/requirements.txt @@ -1,4 +1,5 @@ msprime numba numpy +pandas tskit==0.5.3 diff --git a/requirements/development.txt b/requirements/development.txt index acf5a0b..d5c9975 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -7,6 +7,7 @@ matplotlib msprime numba numpy +pandas pre-commit pytest scipy diff --git a/setup.cfg b/setup.cfg index 788fbab..f1fb39a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,6 +26,7 @@ python_requires = >=3.7 install_requires = numpy numba>=0.57.0 + pandas tskit>=0.5.5 [options.extras_require] diff --git a/stats_tests.py b/stats_tests.py index 904d108..f852a33 100644 --- a/stats_tests.py +++ b/stats_tests.py @@ -8,8 +8,7 @@ import scipy.stats as stats import statsmodels.api as sm import tskit -import tstrait.simulate_phenotype as simulate_phenotype -import tstrait.trait_model as trait_model +import tstrait from tqdm import tqdm @@ -41,17 +40,27 @@ def sim_tree_seq(): Site 0 Genotype: [A, A, T, T, A, T] + Ancestral state: A + Causal allele freq: 0.5 Individual 0: 0 causal Individual 1: 2 causal Individual 2: 1 causal Site 1 Genotype: [A, A, A, A, A, T] + Ancestral state: T + Causal allele freq: 1/6 Individual 0: 0 causal Individual 1: 0 causal Individual 2: 1 causal + + Effect size distribution: + + SD Formula: + trait_sd / sqrt(2) * [sqrt(2 * freq * (1 - freq))] ^ alpha + sqrt(2) from 2 causal sites """ - ts = tskit.Tree.generate_comb(6, span=10).tree_sequence + ts = tskit.Tree.generate_comb(6, span=2).tree_sequence tables = ts.dump_tables() for j in range(2): @@ -75,61 +84,11 @@ def sim_tree_seq(): return ts -def sim_tree_internal(): - """ - The following tree sequence will be generated: - - 6 - 4━━┻━━5 - ┏┻━┓ ┏┻━┓ - 0 1 2 3 - - Individual 0: Node 4 and 5 - Individual 1: Node 2 and 3 - Individual 3: Node 4 and 5 - - Site 0 Ancestral State: "A" - Causal Mutation: Node 4 - - Site 1 Ancestral State: "A" - Causal Mutation: Node 2 - - Site 0 Genotype: - [T, T, A, A, T, A] - Individual 0: 1 causal - Individual 1: 2 causal - Individual 2: 0 causal - - Site 1 Genotype: - [A, A, T, A, A, A] - Individual 0: 0 causal - Individual 1: 0 causal - Individual 2: 1 causal - """ - ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence - - tables = ts.dump_tables() - for j in range(2): - tables.sites.add_row(j, "A") - - tables.individuals.add_row() - tables.individuals.add_row() - tables.individuals.add_row() - individuals = tables.nodes.individual - individuals[0] = 1 - individuals[1] = 1 - individuals[2] = 2 - individuals[3] = 2 - individuals[4] = 0 - individuals[5] = 0 - tables.nodes.individual = individuals - - tables.mutations.add_row(site=0, node=4, derived_state="T") - tables.mutations.add_row(site=1, node=2, derived_state="T") - - ts = tables.tree_sequence() +def sim_tree_seq_freq(alpha): + const1 = np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha)) + const2 = np.sqrt(pow(2 * 1 / 6 * (1 - 1 / 6), alpha)) - return ts + return const1, const2 class Test: @@ -178,257 +137,336 @@ def plot_qq_normal(self, data, loc, scale, filename, title=""): plt.savefig(f, dpi=72) plt.close("all") + def plot_qq_exponential(self, data, scale, filename, title=""): + sm.qqplot(data, stats.expon, scale=scale, line="45") + plt.title(title) + f = self._build_filename(filename) + plt.savefig(f, dpi=72) + plt.close("all") + -class TestGenetic(Test): +class TestNormal(Test): def test_normal(self): - """ - Genotype of individuals: - - Site 0 Genotype: - [A, A, T, T, A, T] - Ancestral state: A - Causal allele freq: 0.5 - Individual 0: 0 causal - Individual 1: 2 causal - Individual 2: 1 causal - - Site 1 Genotype: - [A, A, A, A, A, T] - Ancestral state: T - Causal allele freq: 1/6 - Individual 0: 0 causal - Individual 1: 0 causal - Individual 2: 1 causal - - Effect size distribution: - - SD Formula: - trait_sd / sqrt(2) * [sqrt(2 * freq * (1 - freq))] ^ alpha - sqrt(2) from 2 causal sites - - Environmental noise is simulated from a normal distribution where standard - deviation depends on the variance of the simulated genetic values - """ - rng = np.random.default_rng(1) - alpha_array = np.array([0, -0.3]) - trait_mean_array = np.array([0, 1]) - trait_var_array = np.array([1, 2]) - h2_array = np.array([0.3, 0.8]) + + alpha_array = np.array([0, -0.3, 1]) + mean_array = np.array([0, 1]) + var_array = np.array([1, 2]) ts = sim_tree_seq() count = 0 - prod = itertools.product( - alpha_array, trait_mean_array, trait_var_array, h2_array - ) - - for element in tqdm(prod, total=16): + prod = itertools.product(alpha_array, mean_array, var_array) + for element in tqdm(prod, total=12): alpha = element[0] - trait_mean = element[1] - trait_var = element[2] - trait_sd = np.sqrt(trait_var) - h2 = element[3] - model = trait_model.TraitModelAlleleFrequency(trait_mean, trait_var, alpha) - trait_mean /= 2 - genetic0 = np.zeros(1000) - genetic1 = np.zeros(1000) - genetic2 = np.zeros(1000) - - environment0 = np.zeros(1000) - environment1 = np.zeros(1000) - environment2 = np.zeros(1000) - + mean = element[1] + var = element[2] + sd = np.sqrt(var) + const0, const1 = sim_tree_seq_freq(alpha) + model = tstrait.trait_model(distribution="normal", mean=mean, var=var) + effect_size0 = np.zeros(1000) + effect_size1 = np.zeros(1000) for i in range(1000): - sim_result = simulate_phenotype.sim_phenotype( - ts, num_causal=2, model=model, h2=h2, random_seed=i + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=2, + model=model, + alpha=alpha, + random_seed=i + 1000 * count, ) - phenotype_result = sim_result.phenotype - genetic0[i] = phenotype_result.genetic_value[0] - genetic1[i] = phenotype_result.genetic_value[1] - genetic2[i] = phenotype_result.genetic_value[2] - - environment0[i] = phenotype_result.environment_noise[0] - environment1[i] = phenotype_result.environment_noise[1] - environment2[i] = phenotype_result.environment_noise[2] - assert np.array_equal(genetic0, np.zeros(1000)) - - effect_size_sd0 = ( - trait_sd / np.sqrt(2) * np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha)) + effect_size0[i] = sim_result["effect_size"][0] + effect_size1[i] = sim_result["effect_size"][1] + self.plot_qq_normal( + data=effect_size0, + loc=mean * const0 / 2, + scale=sd * const0 / 2, + filename=f"effect_size_0_{count}", + title=f"EffectSize0, Normal, alpha = {alpha}, " + f"mean = {mean}, var = {var}", ) - effect_size_sd1 = ( - trait_sd / np.sqrt(2) * np.sqrt(pow(2 * 1 / 6 * (1 - 1 / 6), alpha)) + self.plot_qq_normal( + data=effect_size1, + loc=mean * const1 / 2, + scale=sd * const1 / 2, + filename=f"effect_size_1_{count}", + title=f"EffectSize1, Normal, alpha = {alpha}, " + f"mean = {mean}, var = {var}", ) + count += 1 - effect_size_mean0 = trait_mean * np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha)) - effect_size_mean1 = trait_mean * np.sqrt( - pow(2 * 1 / 6 * (1 - 1 / 6), alpha) - ) - ind_sd1 = effect_size_sd0 * 2 - ind_sd2 = np.sqrt((effect_size_sd0**2) + (effect_size_sd1**2)) +class TestExponential(Test): + def test_exponential(self): - self.plot_qq_normal( - data=genetic1, - loc=2 * effect_size_mean0, - scale=ind_sd1, - filename=f"ind_1_genetic_{count}", - title=f"Individual 1 Genetic, alpha = {alpha}, " - f"trait_mean = {trait_mean}, trait_sd = {trait_sd}, h2 = {h2}", + alpha_array = np.array([0, -0.3, 1]) + scale_array = np.array([1, 2.5]) + + ts = sim_tree_seq() + + count = 0 + + prod = itertools.product(alpha_array, scale_array) + for element in tqdm(prod, total=6): + alpha = element[0] + scale = element[1] + const0, const1 = sim_tree_seq_freq(alpha) + model = tstrait.trait_model( + distribution="exponential", scale=scale, negative=False ) - self.plot_qq_normal( - data=genetic2, - loc=effect_size_mean0 + effect_size_mean1, - scale=ind_sd2, - filename=f"ind_2_genetic_{count}", - title=f"Individual 2 Genetic, alpha = {alpha}, " - f"trait_mean = {trait_mean}, trait_sd = {trait_sd}, h2 = {h2}", + effect_size0 = np.zeros(1000) + effect_size1 = np.zeros(1000) + for i in range(1000): + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=2, + model=model, + alpha=alpha, + random_seed=i + 1000 * count, + ) + effect_size0[i] = sim_result["effect_size"][0] + effect_size1[i] = sim_result["effect_size"][1] + self.plot_qq_exponential( + data=effect_size0, + scale=scale / 2 * const0, + filename=f"effect_size_0_{count}", + title=f"EffectSize0, Exponential, alpha = {alpha}, " + f"scale = {scale}, negative = False", + ) + self.plot_qq_exponential( + data=effect_size1, + scale=scale / 2 * const1, + filename=f"effect_size_1_{count}", + title=f"EffectSize1, Expnential, alpha = {alpha}, " + f"scale = {scale}, negative = False", ) + count += 1 - genetic_sim = np.zeros(3) - env_sim = np.zeros(1000) + def test_exponential_negative(self): - for i in range(1000): - x1 = rng.normal(loc=effect_size_mean0, scale=effect_size_sd0) - x2 = rng.normal(loc=effect_size_mean1, scale=effect_size_sd1) - genetic_sim[0] = 0 - genetic_sim[1] = 2 * x1 - genetic_sim[2] = x1 + x2 - env_std = np.sqrt((1 - h2) / h2 * np.var(genetic_sim)) - env_sim[i] = rng.normal(loc=0, scale=env_std) + alpha_array = np.array([0, -0.3, 1]) + scale_array = np.array([1, 2.5]) - self.plot_qq_compare( - v1=environment0, - v2=env_sim, - x_label="Individual 0 Environment", - y_label="Simulated values", - filename=f"ind_0_env_{count}", - title=f"Individual 0 Env, alpha = {alpha}, trait_mean = {trait_mean}, " - f"trait_sd = {trait_sd}, h2 = {h2}", + ts = sim_tree_seq() + + count = 0 + + prod = itertools.product(alpha_array, scale_array) + for element in tqdm(prod, total=6): + alpha = element[0] + scale = element[1] + const0, const1 = sim_tree_seq_freq(alpha) + model = tstrait.trait_model( + distribution="exponential", scale=scale, negative=True ) + effect_size0 = np.zeros(1000) + effect_size1 = np.zeros(1000) + for i in range(1000): + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=2, + model=model, + alpha=alpha, + random_seed=i + 1000 * count, + ) + effect_size0[i] = sim_result["effect_size"][0] + effect_size1[i] = sim_result["effect_size"][1] + rng = np.random.default_rng(count + 1) + exponential = rng.exponential(scale=scale / 2, size=1000) + exponential *= rng.choice([-1, 1], size=1000) + self.plot_qq_compare( - v1=environment1, - v2=env_sim, - x_label="Individual 1 Environment", - y_label="Simulated values", - filename=f"ind_1_env_{count}", - title=f"Individual 1 Env, alpha = {alpha}, trait_mean = {trait_mean}, " - f"trait_sd = {trait_sd}, h2 = {h2}", + v1=effect_size0, + v2=exponential * const0, + x_label="Simulated Effect Size", + y_label="Exponential Distribution", + filename=f"effect_size_0_negative_{count}", + title=f"EffectSize0, Exponential, alpha = {alpha}, " + f"scale = {scale}, negative = True", ) self.plot_qq_compare( - v1=environment2, - v2=env_sim, - x_label="Individual 2 Environment", - y_label="Simulated values", - filename=f"ind_2_env_{count}", - title=f"Individual 2 Env, alpha = {alpha}, trait_mean = {trait_mean}, " - f"trait_sd = {trait_sd}, h2 = {h2}", + v1=effect_size1, + v2=exponential * const1, + x_label="Simulated Effect Size", + y_label="Exponential Distribution", + filename=f"effect_size_1_negative_{count}", + title=f"EffectSize1, Exponential, alpha = {alpha}, " + f"scale = {scale}, negative = True", ) count += 1 -class TestInternal(Test): - def test_internal(self): - """ - Genotype of individuals: +class TestT(Test): + def test_t(self): - Site 0 Genotype: - [T, T, A, A, T, A] - Ancestral state: A - Causal allele freq: 0.5 - Individual 0: 1 causal - Individual 1: 2 causal - Individual 2: 0 causal + alpha_array = np.array([0, -0.3, 1]) + mean_array = np.array([0, 1]) + var_array = np.array([1, 2]) + df_array = np.array([10, 13.5]) - Site 1 Genotype: - [A, A, T, A, A, A] - Ancestral state: A - Causal allele freq: 1/6 - Individual 0: 0 causal - Individual 1: 0 causal - Individual 2: 1 causal + ts = sim_tree_seq() - Effect size distribution: + count = 0 - SD Formula: - trait_sd / sqrt(2) * [sqrt(2 * freq * (1 - freq))] ^ alpha - sqrt(2) from 2 causal sites + prod = itertools.product(alpha_array, mean_array, var_array, df_array) + for element in tqdm(prod, total=24): + alpha = element[0] + mean = element[1] + var = element[2] + df = element[3] + sd = np.sqrt(var) + const0, const1 = sim_tree_seq_freq(alpha) + model = tstrait.trait_model(distribution="t", mean=mean, var=var, df=df) + effect_size0 = np.zeros(1000) + effect_size1 = np.zeros(1000) + for i in range(1000): + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=2, + model=model, + alpha=alpha, + random_seed=i + 1000 * count, + ) + effect_size0[i] = sim_result["effect_size"][0] + effect_size1[i] = sim_result["effect_size"][1] + rng = np.random.default_rng(count + 1) + simulated_t = rng.standard_t(df, size=1000) + simulated_t = simulated_t * sd + mean + simulated_t /= 2 - Environmental noise is simulated from a normal distribution where standard - deviation depends on the variance of the simulated genetic values - """ + self.plot_qq_compare( + v1=effect_size0, + v2=simulated_t * const0, + x_label="Simulated Effect Size", + y_label="T Distribution", + filename=f"effect_size_0_{count}", + title=f"EffectSize0, T, alpha = {alpha}, mean = {mean}, " + f"var = {var}, df = {df}", + ) + self.plot_qq_compare( + v1=effect_size1, + v2=simulated_t * const1, + x_label="Simulated Effect Size", + y_label="T Distribution", + filename=f"effect_size_1_{count}", + title=f"EffectSize1, T, alpha = {alpha}, mean = {mean}, " + f"var = {var}, df = {df}", + ) + count += 1 - ts = sim_tree_internal() - alpha_array = np.array([0, -1]) - trait_mean_array = np.array([0, 1]) - trait_var_array = np.array([1, 2]) - h2_array = np.array([0.3, 0.8]) +class TestGamma(Test): + def test_gamma(self): - count = 0 + alpha_array = np.array([0, -0.3, 1]) + shape_array = np.array([1, 3.5]) + scale_array = np.array([1, 2.5]) - prod = itertools.product( - alpha_array, trait_mean_array, trait_var_array, h2_array - ) + ts = sim_tree_seq() - for element in tqdm(prod, total=16): - alpha = element[0] - trait_mean = element[1] - trait_var = element[2] - trait_sd = np.sqrt(trait_var) - h2 = element[3] - model = trait_model.TraitModelAlleleFrequency(trait_mean, trait_var, alpha) - trait_mean /= 2 - genetic0 = np.zeros(1000) - genetic1 = np.zeros(1000) - genetic2 = np.zeros(1000) + count = 0 + prod = itertools.product(alpha_array, shape_array, scale_array) + for element in tqdm(prod, total=12): + alpha = element[0] + shape = element[1] + scale = element[2] + const0, const1 = sim_tree_seq_freq(alpha) + model = tstrait.trait_model( + distribution="gamma", shape=shape, scale=scale, negative=False + ) + effect_size0 = np.zeros(1000) + effect_size1 = np.zeros(1000) for i in range(1000): - sim_result = simulate_phenotype.sim_phenotype( - ts, num_causal=2, model=model, h2=h2, random_seed=i + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=2, + model=model, + alpha=alpha, + random_seed=i + 1000 * count, ) - phenotype_result = sim_result.phenotype - genetic0[i] = phenotype_result.genetic_value[0] - genetic1[i] = phenotype_result.genetic_value[1] - genetic2[i] = phenotype_result.genetic_value[2] + effect_size0[i] = sim_result["effect_size"][0] + effect_size1[i] = sim_result["effect_size"][1] - effect_size_sd0 = ( - trait_sd / np.sqrt(2) * np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha)) - ) - effect_size_sd1 = ( - trait_sd / np.sqrt(2) * np.sqrt(pow(2 * 1 / 6 * (1 - 1 / 6), alpha)) + rng = np.random.default_rng(count + 1) + gamma = rng.gamma(shape=shape, scale=scale, size=1000) + gamma /= 2 + + self.plot_qq_compare( + v1=effect_size0, + v2=gamma * const0, + x_label="Simulated Effect Size", + y_label="Gamma Distribution", + filename=f"effect_size_0_{count}", + title=f"EffectSize0, Gamma, alpha = {alpha}, " + f"shape = {shape}, scale = {scale}, negative = False", ) - effect_size_mean0 = trait_mean * np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha)) - effect_size_mean1 = trait_mean * np.sqrt( - pow(2 * 1 / 6 * (1 - 1 / 6), alpha) + self.plot_qq_compare( + v1=effect_size1, + v2=gamma * const1, + x_label="Simulated Effect Size", + y_label="Gamma Distribution", + filename=f"effect_size_1_{count}", + title=f"EffectSize1, Gamma, alpha = {alpha}, " + f"shape = {shape}, scale = {scale}, negative = False", ) + count += 1 - self.plot_qq_normal( - data=genetic0, - loc=effect_size_mean0, - scale=effect_size_sd0, - filename=f"internal_ind_0_genetic_{count}", - title=f"Individual 0 Genetic, alpha = {alpha}, " - f"trait_mean = {trait_mean}, trait_sd = {trait_sd}, h2 = {h2}", + def test_gamma_negative(self): + + alpha_array = np.array([0, -0.3, 1]) + shape_array = np.array([1, 3.5]) + scale_array = np.array([1, 2.5]) + + ts = sim_tree_seq() + + count = 0 + + prod = itertools.product(alpha_array, shape_array, scale_array) + for element in tqdm(prod, total=12): + alpha = element[0] + shape = element[1] + scale = element[2] + const0, const1 = sim_tree_seq_freq(alpha) + model = tstrait.trait_model( + distribution="gamma", shape=shape, scale=scale, negative=True ) - self.plot_qq_normal( - data=genetic1, - loc=2 * effect_size_mean0, - scale=2 * effect_size_sd0, - filename=f"internal_ind_1_genetic_{count}", - title=f"Individual 1 Genetic, alpha = {alpha}, " - f"trait_mean = {trait_mean}, trait_sd = {trait_sd}, h2 = {h2}", + effect_size0 = np.zeros(1000) + effect_size1 = np.zeros(1000) + for i in range(1000): + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=2, + model=model, + alpha=alpha, + random_seed=i + 1000 * count, + ) + effect_size0[i] = sim_result["effect_size"][0] + effect_size1[i] = sim_result["effect_size"][1] + + rng = np.random.default_rng(count + 1) + gamma = rng.gamma(shape=shape, scale=scale, size=1000) + gamma /= 2 + gamma *= rng.choice([-1, 1], size=1000) + + self.plot_qq_compare( + v1=effect_size0, + v2=gamma * const0, + x_label="Simulated Effect Size", + y_label="Gamma Distribution", + filename=f"effect_size_0_negative_{count}", + title=f"EffectSize0, Gamma, alpha = {alpha}, " + f"shape = {shape}, scale = {scale}, negative = True", ) - self.plot_qq_normal( - data=genetic2, - loc=effect_size_mean1, - scale=effect_size_sd1, - filename=f"internal_ind_2_genetic_{count}", - title=f"Individual 2 Genetic, alpha = {alpha}, " - f"trait_mean = {trait_mean}, trait_sd = {trait_sd}, h2 = {h2}", + self.plot_qq_compare( + v1=effect_size1, + v2=gamma * const1, + x_label="Simulated Effect Size", + y_label="Gamma Distribution", + filename=f"effect_size_1_negative_{count}", + title=f"EffectSize1, Gamma, alpha = {alpha}, " + f"shape = {shape}, scale = {scale}, negative = True", ) - count += 1 @@ -440,4 +478,7 @@ def run_tests(suite, output_dir): if __name__ == "__main__": - run_tests(["TestGenetic", "TestInternal"], "_output/stats_tests_output") + run_tests( + ["TestNormal", "TestExponential", "TestT", "TestGamma"], + "_output/stats_tests_output", + ) diff --git a/tests/test_sim_trait.py b/tests/test_sim_trait.py new file mode 100644 index 0000000..c1167e4 --- /dev/null +++ b/tests/test_sim_trait.py @@ -0,0 +1,746 @@ +import functools + +import msprime +import numpy as np +import pytest +import tskit +import tstrait + + +@functools.lru_cache(maxsize=None) +def all_trees_ts(n): + """ + Generate a tree sequence that corresponds to the lexicographic listing + of all trees with n leaves (i.e. from tskit.all_trees(n)). + + Note: it would be nice to include a version of this in the combinatorics + module at some point but the implementation is quite inefficient. Also + it's not entirely clear that the way we're allocating node times is + guaranteed to work. + """ + tables = tskit.TableCollection(0) + for _ in range(n): + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + for j in range(1, n): + tables.nodes.add_row(flags=0, time=j) + + L = 0 + for tree in tskit.all_trees(n): + for u in tree.preorder()[1:]: + tables.edges.add_row(L, L + 1, tree.parent(u), u) + L += 1 + tables.sequence_length = L + tables.sort() + tables.simplify() + return tables.tree_sequence() + + +class Test_sim_phenotype_output_dim: + def check_dimensions(self, df, num_causal): + assert len(df) == num_causal + assert df.shape[1] == 3 + assert len(df["site_id"]) == num_causal + assert len(df["causal_state"]) == num_causal + assert len(df["effect_size"]) == num_causal + + @pytest.mark.parametrize("num_ind", [1, 2, np.array([5])[0]]) + @pytest.mark.parametrize("num_causal", [1, 2, np.array([3])[0]]) + @pytest.mark.parametrize("alpha", [0, 1, -1.1]) + @pytest.mark.parametrize("random_seed", [1, 2, None]) + def test_output_dim_normal(self, num_ind, num_causal, alpha, random_seed): + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + ts = msprime.sim_ancestry( + num_ind, sequence_length=100_000, random_seed=random_seed + ) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=random_seed) + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=num_causal, + model=model, + alpha=alpha, + random_seed=random_seed, + ) + self.check_dimensions(sim_result, num_causal) + + def test_binary_mutation_model(self): + random_seed = 1 + num_ind = 10 + num_causal = 5 + alpha = 1 + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + ts = msprime.sim_ancestry( + num_ind, sequence_length=100_000, random_seed=random_seed + ) + ts = msprime.sim_mutations( + ts, rate=0.01, random_seed=random_seed, model="binary" + ) + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=num_causal, + model=model, + alpha=alpha, + random_seed=random_seed, + ) + self.check_dimensions(sim_result, num_causal) + + def test_output_dim_exponential(self): + model = tstrait.trait_model(distribution="exponential", scale=1) + random_seed = 1 + num_ind = 10 + num_causal = 5 + alpha = 1 + ts = msprime.sim_ancestry( + num_ind, sequence_length=100_000, random_seed=random_seed + ) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=random_seed) + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=num_causal, + model=model, + alpha=alpha, + random_seed=random_seed, + ) + self.check_dimensions(sim_result, num_causal) + + def test_output_dim_fixed(self): + model = tstrait.trait_model(distribution="fixed", value=1) + random_seed = 1 + num_ind = 10 + num_causal = 5 + alpha = 1 + ts = msprime.sim_ancestry( + num_ind, sequence_length=100_000, random_seed=random_seed + ) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=random_seed) + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=num_causal, + model=model, + alpha=alpha, + random_seed=random_seed, + ) + self.check_dimensions(sim_result, num_causal) + + def test_output_dim_t(self): + model = tstrait.trait_model(distribution="t", mean=0, var=1, df=1) + random_seed = 1 + num_ind = 10 + num_causal = 5 + alpha = 1 + ts = msprime.sim_ancestry( + num_ind, sequence_length=100_000, random_seed=random_seed + ) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=random_seed) + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=num_causal, + model=model, + alpha=alpha, + random_seed=random_seed, + ) + self.check_dimensions(sim_result, num_causal) + + def test_output_dim_gamma(self): + model = tstrait.trait_model(distribution="gamma", shape=1, scale=1) + random_seed = 1 + num_ind = 10 + num_causal = 5 + alpha = 1 + ts = msprime.sim_ancestry( + num_ind, sequence_length=100_000, random_seed=random_seed + ) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=random_seed) + sim_result = tstrait.sim_trait( + ts=ts, + num_causal=num_causal, + model=model, + alpha=alpha, + random_seed=random_seed, + ) + self.check_dimensions(sim_result, num_causal) + + +class Test_sim_phenotype_input: + @pytest.mark.parametrize("ts", [0, "a", [1, 1]]) + def test_ts(self, ts): + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + with pytest.raises(TypeError, match="Input must be a tree sequence data"): + tstrait.sim_trait(ts=ts, num_causal=3, model=model, alpha=1, random_seed=1) + + @pytest.mark.parametrize("num_causal", ["1", "a", [1, 1]]) + def test_num_causal(self, num_causal): + ts = msprime.sim_ancestry(2, sequence_length=100_000, random_seed=1) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1) + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + with pytest.raises( + TypeError, match="Number of causal sites must be an integer" + ): + tstrait.sim_trait( + ts=ts, num_causal=num_causal, model=model, alpha=1, random_seed=1 + ) + + @pytest.mark.parametrize("num_causal", [-1, 1.8, -1.5, 0]) + def test_num_causal_positive(self, num_causal): + ts = msprime.sim_ancestry(2, sequence_length=100_000, random_seed=1) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1) + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + with pytest.raises( + ValueError, match="Number of causal sites must be a positive integer" + ): + tstrait.sim_trait( + ts=ts, num_causal=num_causal, model=model, alpha=1, random_seed=1 + ) + + @pytest.mark.parametrize("model", ["normal", 1, [None, 1, "a"]]) + def test_model(self, model): + ts = msprime.sim_ancestry(5, sequence_length=100_000, random_seed=2) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=2) + with pytest.raises( + TypeError, match="Trait model must be an instance of TraitModel" + ): + tstrait.sim_trait(ts=ts, num_causal=1, model=model, alpha=1, random_seed=1) + + def test_error_mutation(self): + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + ts = msprime.sim_ancestry(10, sequence_length=100_000, random_seed=1) + with pytest.raises(ValueError, match="No mutation in the provided data"): + tstrait.sim_trait( + ts=ts, + num_causal=5, + model=model, + alpha=0, + random_seed=1, + ) + + def test_error_num_causal_add(self): + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + ts = msprime.sim_ancestry(10, sequence_length=100_000, random_seed=1) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1) + num_causal = ts.num_sites + 1 + with pytest.raises( + ValueError, + match="There are less number of sites in the " + "tree sequence than the inputted number of causal sites", + ): + tstrait.sim_trait( + ts=ts, + num_causal=num_causal, + model=model, + alpha=0, + random_seed=1, + ) + + def test_output_num_causal(self): + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + ts = msprime.sim_ancestry(3, sequence_length=100_000, random_seed=1) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1) + num_causal = ts.num_sites + sim_result = tstrait.sim_trait( + ts=ts, num_causal=num_causal, model=model, alpha=-1, random_seed=1 + ) + assert sim_result.shape[0] == num_causal + + @pytest.mark.parametrize("alpha", ["1", "a", [1, 1]]) + def test_alpha(self, alpha): + ts = msprime.sim_ancestry(2, sequence_length=100_000, random_seed=1) + ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1) + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + with pytest.raises(TypeError, match="Alpha must be a number"): + tstrait.sim_trait( + ts=ts, num_causal=3, model=model, alpha=alpha, random_seed=1 + ) + + +class Test_site_genotypes: + def test_binary_tree(self): + # 3.00 6 + # ┊ ┏━┻━┓ ┊ + # 2.00┊ ┃ 5 ┊ + # ┊ ┃ ┏━┻┓ ┊ + # 1.00┊ ┃ ┃ 4 ┊ + # ┊ ┃ ┃ ┏┻┓ ┊ + # 0.00 0 1 2 3 + # + ts = tskit.Tree.generate_comb(4, span=15).tree_sequence + tables = ts.dump_tables() + for j in range(12): + tables.sites.add_row(j, "A") + + tables.individuals.add_row() + tables.individuals.add_row() + individuals = tables.nodes.individual + individuals[0] = 0 + individuals[1] = 0 + individuals[2] = 1 + individuals[3] = 1 + tables.nodes.individual = individuals + + tables.mutations.add_row(site=0, node=0, derived_state="T") + tables.mutations.add_row(site=1, node=4, derived_state="T") + tables.mutations.add_row(site=2, node=1, derived_state="T") + + tables.mutations.add_row(site=3, node=5, derived_state="T") + tables.mutations.add_row(site=3, node=2, derived_state="T", parent=3) + + tables.mutations.add_row(site=4, node=0, derived_state="T") + tables.mutations.add_row(site=4, node=4, derived_state="T") + + tables.mutations.add_row(site=5, node=0, derived_state="T") + tables.mutations.add_row(site=5, node=0, derived_state="A", parent=7) + + tables.mutations.add_row(site=6, node=0, derived_state="T") + tables.mutations.add_row(site=6, node=0, derived_state="G", parent=9) + tables.mutations.add_row(site=6, node=0, derived_state="T", parent=10) + tables.mutations.add_row(site=6, node=0, derived_state="C", parent=11) + + tables.mutations.add_row(site=7, node=5, derived_state="T") + tables.mutations.add_row(site=7, node=4, derived_state="C", parent=13) + + tables.mutations.add_row(site=8, node=5, derived_state="T") + tables.mutations.add_row(site=8, node=4, derived_state="C", parent=15) + tables.mutations.add_row(site=8, node=4, derived_state="T", parent=16) + tables.mutations.add_row(site=8, node=4, derived_state="A", parent=17) + tables.mutations.add_row(site=8, node=4, derived_state="T", parent=18) + + tables.mutations.add_row(site=9, node=6, derived_state="A") + + tables.mutations.add_row(site=10, node=6, derived_state="C") + + tables.mutations.add_row(site=11, node=5, derived_state="C") + tables.mutations.add_row(site=11, node=0, derived_state="T") + + ts = tables.tree_sequence() + + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + simulator = tstrait.TraitSimulator( + ts=ts, num_causal=1, model=model, alpha=0.3, random_seed=1 + ) + tree = ts.first() + + c1 = simulator._obtain_allele_count(tree, ts.site(0)) + c2 = simulator._obtain_allele_count(tree, ts.site(1)) + c3 = simulator._obtain_allele_count(tree, ts.site(2)) + c4 = simulator._obtain_allele_count(tree, ts.site(3)) + c5 = simulator._obtain_allele_count(tree, ts.site(4)) + c6 = simulator._obtain_allele_count(tree, ts.site(5)) + c7 = simulator._obtain_allele_count(tree, ts.site(6)) + c8 = simulator._obtain_allele_count(tree, ts.site(7)) + c9 = simulator._obtain_allele_count(tree, ts.site(8)) + c10 = simulator._obtain_allele_count(tree, ts.site(10)) + c11 = simulator._obtain_allele_count(tree, ts.site(11)) + + assert c1 == {"T": 1} + assert c2 == {"T": 2} + assert c3 == {"T": 1} + assert c4 == {"T": 3} + assert c5 == {"T": 3} + assert c6 == {"A": 4} + assert c7 == {"C": 1} + assert c8 == {"T": 1, "C": 2} + assert c9 == {"T": 3} + assert c10 == {"C": 4} + assert c11 == {"C": 3, "T": 1} + + def test_binary_tree_different_individual(self): + # 3.00 6 + # ┊ ┏━┻━┓ ┊ + # 2.00┊ ┃ 5 ┊ + # ┊ ┃ ┏━┻┓ ┊ + # 1.00┊ ┃ ┃ 4 ┊ + # ┊ ┃ ┃ ┏┻┓ ┊ + # 0.00 0 1 2 3 + # + ts = tskit.Tree.generate_comb(4, span=10).tree_sequence + tables = ts.dump_tables() + for j in range(9): + tables.sites.add_row(j, "A") + + tables.individuals.add_row() + tables.individuals.add_row() + individuals = tables.nodes.individual + individuals[0] = 0 + individuals[1] = 1 + individuals[2] = 0 + individuals[3] = 1 + tables.nodes.individual = individuals + + tables.mutations.add_row(site=0, node=0, derived_state="T") + tables.mutations.add_row(site=1, node=4, derived_state="T") + tables.mutations.add_row(site=2, node=1, derived_state="T") + + tables.mutations.add_row(site=3, node=5, derived_state="T") + tables.mutations.add_row(site=3, node=2, derived_state="T", parent=3) + + tables.mutations.add_row(site=4, node=0, derived_state="T") + tables.mutations.add_row(site=4, node=4, derived_state="T") + + tables.mutations.add_row(site=5, node=0, derived_state="T") + tables.mutations.add_row(site=5, node=0, derived_state="A", parent=7) + + tables.mutations.add_row(site=6, node=0, derived_state="T") + tables.mutations.add_row(site=6, node=0, derived_state="G", parent=9) + tables.mutations.add_row(site=6, node=0, derived_state="T", parent=10) + tables.mutations.add_row(site=6, node=0, derived_state="C", parent=11) + + tables.mutations.add_row(site=7, node=5, derived_state="T") + tables.mutations.add_row(site=7, node=4, derived_state="C", parent=13) + + tables.mutations.add_row(site=8, node=5, derived_state="T") + tables.mutations.add_row(site=8, node=4, derived_state="C", parent=15) + tables.mutations.add_row(site=8, node=4, derived_state="T", parent=16) + tables.mutations.add_row(site=8, node=4, derived_state="A", parent=17) + tables.mutations.add_row(site=8, node=4, derived_state="T", parent=18) + ts = tables.tree_sequence() + + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + simulator = tstrait.TraitSimulator( + ts=ts, num_causal=1, model=model, alpha=0.3, random_seed=1 + ) + tree = ts.first() + + c1 = simulator._obtain_allele_count(tree, ts.site(0)) + c2 = simulator._obtain_allele_count(tree, ts.site(1)) + c3 = simulator._obtain_allele_count(tree, ts.site(2)) + c4 = simulator._obtain_allele_count(tree, ts.site(3)) + c5 = simulator._obtain_allele_count(tree, ts.site(4)) + c6 = simulator._obtain_allele_count(tree, ts.site(5)) + c7 = simulator._obtain_allele_count(tree, ts.site(6)) + c8 = simulator._obtain_allele_count(tree, ts.site(7)) + c9 = simulator._obtain_allele_count(tree, ts.site(8)) + + assert c1 == {"T": 1} + assert c2 == {"T": 2} + assert c3 == {"T": 1} + assert c4 == {"T": 3} + assert c5 == {"T": 3} + assert c6 == {"A": 4} + assert c7 == {"C": 1} + assert c8 == {"T": 1, "C": 2} + assert c9 == {"T": 3} + + def test_binary_tree_internal_node(self): + ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence + + tables = ts.dump_tables() + for j in range(3): + tables.sites.add_row(j, "A") + + tables.individuals.add_row() + tables.individuals.add_row() + tables.individuals.add_row() + individuals = tables.nodes.individual + individuals[0] = 1 + individuals[1] = 1 + individuals[2] = 2 + individuals[3] = 2 + individuals[4] = 0 + individuals[5] = 0 + tables.nodes.individual = individuals + + tables.mutations.add_row(site=0, node=0, derived_state="T") + tables.mutations.add_row(site=0, node=5, derived_state="G") + + tables.mutations.add_row(site=1, node=4, derived_state="T") + tables.mutations.add_row(site=1, node=0, derived_state="A", parent=2) + tables.mutations.add_row(site=1, node=5, derived_state="T") + tables.mutations.add_row(site=1, node=5, derived_state="G", parent=4) + + tables.mutations.add_row(site=2, node=4, derived_state="T") + tables.mutations.add_row(site=2, node=0, derived_state="G", parent=6) + tables.mutations.add_row(site=2, node=1, derived_state="C", parent=6) + + ts = tables.tree_sequence() + + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + simulator = tstrait.TraitSimulator( + ts=ts, num_causal=1, model=model, alpha=0.3, random_seed=1 + ) + tree = ts.first() + + c1 = simulator._obtain_allele_count(tree, ts.site(0)) + c2 = simulator._obtain_allele_count(tree, ts.site(1)) + c3 = simulator._obtain_allele_count(tree, ts.site(2)) + + assert c1 == {"T": 1, "G": 2} + assert c2 == {"T": 1, "G": 2} + assert c3 == {"G": 1, "C": 1} + + def test_non_binary_tree(self): + # 2.00 7 + # ┊ ┏━┏━━┏━┻━━┓ ┊ + # 1.00┊ ┃ ┃ ┃ 6 ┊ + # ┊ ┃ ┃ ┃ ┏━┳┻┓ ┊ + # 0.00 0 1 2 3 4 5 + ts = tskit.Tree.generate_balanced(6, arity=4, span=10).tree_sequence + + tables = ts.dump_tables() + for j in range(3): + tables.sites.add_row(j, "A") + + tables.individuals.add_row() + tables.individuals.add_row() + tables.individuals.add_row() + individuals = tables.nodes.individual + individuals[0] = 0 + individuals[1] = 0 + individuals[2] = 1 + individuals[3] = 1 + individuals[4] = 2 + individuals[5] = 2 + tables.nodes.individual = individuals + + tables.mutations.add_row(site=0, node=6, derived_state="T") + tables.mutations.add_row(site=1, node=0, derived_state="T") + tables.mutations.add_row(site=2, node=6, derived_state="T") + tables.mutations.add_row(site=2, node=6, derived_state="C", parent=2) + tables.mutations.add_row(site=2, node=5, derived_state="T", parent=3) + + ts = tables.tree_sequence() + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + simulator = tstrait.TraitSimulator( + ts=ts, num_causal=1, model=model, alpha=0.3, random_seed=1 + ) + tree = ts.first() + + c1 = simulator._obtain_allele_count(tree, ts.site(0)) + c2 = simulator._obtain_allele_count(tree, ts.site(1)) + c3 = simulator._obtain_allele_count(tree, ts.site(2)) + + assert c1 == {"T": 3} + assert c2 == {"T": 1} + assert c3 == {"C": 2, "T": 1} + + def test_binary_tree_additional(self): + ts = tskit.Tree.generate_comb(6, span=10).tree_sequence + tables = ts.dump_tables() + for j in range(5): + tables.sites.add_row(j, "A") + + tables.individuals.add_row() + tables.individuals.add_row() + tables.individuals.add_row() + individuals = tables.nodes.individual + individuals[0] = 0 + individuals[1] = 0 + individuals[2] = 1 + individuals[3] = 1 + individuals[4] = 2 + individuals[5] = 2 + tables.nodes.individual = individuals + + tables.mutations.add_row(site=0, node=7, derived_state="T") + tables.mutations.add_row(site=1, node=0, derived_state="T") + tables.mutations.add_row(site=1, node=0, derived_state="A", parent=1) + + tables.mutations.add_row(site=2, node=9, derived_state="T") + tables.mutations.add_row(site=2, node=3, derived_state="C", parent=3) + + tables.mutations.add_row(site=3, node=8, derived_state="T") + tables.mutations.add_row(site=3, node=8, derived_state="A", parent=5) + tables.mutations.add_row(site=3, node=8, derived_state="C", parent=6) + tables.mutations.add_row(site=3, node=3, derived_state="A", parent=7) + tables.mutations.add_row(site=3, node=6, derived_state="G", parent=7) + + tables.mutations.add_row(site=4, node=8, derived_state="A") + + ts = tables.tree_sequence() + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + simulator = tstrait.TraitSimulator( + ts=ts, num_causal=1, model=model, alpha=0.3, random_seed=1 + ) + tree = ts.first() + + c1 = simulator._obtain_allele_count(tree, ts.site(0)) + c2 = simulator._obtain_allele_count(tree, ts.site(1)) + c3 = simulator._obtain_allele_count(tree, ts.site(2)) + c4 = simulator._obtain_allele_count(tree, ts.site(3)) + c5 = simulator._obtain_allele_count(tree, ts.site(4)) + + assert c1 == {"T": 3} + assert c2 == {"A": 6} + assert c3 == {"T": 4, "C": 1} + assert c4 == {"G": 2, "C": 1} + assert c5 == {"A": 6} + + +def sim_tree_seq(): + """ + The following tree sequence will be generated: + 10 + ┏━┻┓ + ┃ 9 + ┃ ┏┻━┓ + ┃ ┃ 8 + ┃ ┃ ┏┻━┓ + ┃ ┃ ┃ 7 + ┃ ┃ ┃ ┏┻━┓ + ┃ ┃ ┃ ┃ 6 + ┃ ┃ ┃ ┃ ┏┻━┓ + 0 1 2 3 4 5 + + Individual 0: Node 0 and 1 + Individual 1: Node 2 and 3 + Individual 3: Node 4 and 5 + + Site 0 Ancestral State: "A" + Causal Mutation: Node 8 + Reverse Mutation: Node 4 + + Site 1 Ancestral State: "A" + Causal Mutation: Node 5 + + Site 0 Genotype: + [A, A, T, T, A, T] + Ancestral state: A + Causal allele freq: 0.5 + Individual 0: 0 causal + Individual 1: 2 causal + Individual 2: 1 causal + + Site 1 Genotype: + [A, A, A, A, A, T] + Ancestral state: A + Causal allele freq: 1/6 + Individual 0: 0 causal + Individual 1: 0 causal + Individual 2: 1 causal + + Effect size distribution: + + SD Formula: + trait_sd / sqrt(2) * [sqrt(2 * freq * (1 - freq))] ^ alpha + sqrt(2) from 2 causal sites + """ + ts = tskit.Tree.generate_comb(6, span=2).tree_sequence + + tables = ts.dump_tables() + for j in range(2): + tables.sites.add_row(j, "A") + + tables.individuals.add_row() + tables.individuals.add_row() + tables.individuals.add_row() + individuals = tables.nodes.individual + individuals[0] = 0 + individuals[1] = 0 + individuals[2] = 1 + individuals[3] = 1 + individuals[4] = 2 + individuals[5] = 2 + tables.nodes.individual = individuals + tables.mutations.add_row(site=0, node=8, derived_state="T") + tables.mutations.add_row(site=0, node=4, derived_state="A", parent=0) + tables.mutations.add_row(site=1, node=5, derived_state="T") + ts = tables.tree_sequence() + return ts + + +def sim_tree_seq_freq(alpha): + const1 = np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha)) + const2 = np.sqrt(pow(2 * 1 / 6 * (1 - 1 / 6), alpha)) + + return const1, const2 + + +class Test_fixed_effect_size: + @pytest.mark.parametrize("value", [0, 1, -2]) + @pytest.mark.parametrize("alpha", [0, 1, -0.5]) + @pytest.mark.parametrize("random_seed", [1, 2, None]) + def test_output_dim_normal(self, value, alpha, random_seed): + model = tstrait.trait_model(distribution="fixed", value=value) + ts = sim_tree_seq() + sim_result = tstrait.sim_trait( + ts=ts, num_causal=2, model=model, alpha=alpha, random_seed=random_seed + ) + const1, const2 = sim_tree_seq_freq(alpha) + assert np.array_equal(sim_result["site_id"], np.array([0, 1])) + assert np.array_equal(sim_result["causal_state"], np.array(["T", "T"])) + assert np.isclose(sim_result["effect_size"][0], value * const1) + assert np.isclose(sim_result["effect_size"][1], value * const2) + + +class Test_allele_freq_one: + @pytest.mark.parametrize("alpha", [0, 1, -0.5]) + def test_allele_freq_one(self, alpha): + ts = tskit.Tree.generate_comb(6, span=2).tree_sequence + tables = ts.dump_tables() + for j in range(2): + tables.sites.add_row(j, "A") + tables.mutations.add_row(site=0, node=8, derived_state="T") + tables.mutations.add_row(site=0, node=8, derived_state="A", parent=0) + tables.mutations.add_row(site=1, node=5, derived_state="T") + tables.mutations.add_row(site=1, node=5, derived_state="A", parent=2) + ts = tables.tree_sequence() + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + sim_result = tstrait.sim_trait( + ts=ts, num_causal=2, model=model, alpha=alpha, random_seed=1 + ) + assert np.array_equal(sim_result["effect_size"], np.array([0, 0])) + + +class Test_tree_sequence: + def const(self, freq, alpha): + value = np.sqrt(pow(2 * freq * (1 - freq), alpha)) + return value + + def test_tree_sequence(self): + ts = all_trees_ts(4) + tables = ts.dump_tables() + + tables.individuals.add_row() + tables.individuals.add_row() + individuals = tables.nodes.individual + individuals[0] = 0 + individuals[1] = 0 + individuals[2] = 1 + individuals[3] = 1 + tables.nodes.individual = individuals + + tables.sites.add_row(1, "A") + tables.sites.add_row(7, "A") + tables.sites.add_row(11, "G") + tables.sites.add_row(12, "C") + tables.sites.add_row(13, "A") + tables.sites.add_row(14, "A") + + tables.mutations.add_row(site=0, node=4, derived_state="T") + tables.mutations.add_row(site=1, node=4, derived_state="T") + tables.mutations.add_row(site=2, node=5, derived_state="C") + tables.mutations.add_row(site=2, node=4, derived_state="G", parent=2) + tables.mutations.add_row(site=3, node=5, derived_state="C") + tables.mutations.add_row(site=3, node=3, derived_state="T", parent=4) + tables.mutations.add_row(site=4, node=3, derived_state="T") + tables.mutations.add_row(site=4, node=3, derived_state="A", parent=6) + tables.mutations.add_row(site=4, node=3, derived_state="C", parent=7) + tables.mutations.add_row(site=5, node=4, derived_state="C") + tables.mutations.add_row(site=5, node=4, derived_state="C", parent=9) + tables.mutations.add_row(site=5, node=1, derived_state="A") + + ts = tables.tree_sequence() + + model = tstrait.trait_model(distribution="fixed", value=2) + sim_result = tstrait.sim_trait( + ts=ts, num_causal=6, model=model, alpha=-1, random_seed=1 + ) + assert np.array_equal(sim_result["site_id"], np.array([0, 1, 2, 3, 4, 5])) + assert np.array_equal( + sim_result["causal_state"], np.array(["T", "T", "C", "T", "C", "C"]) + ) + assert np.isclose( + sim_result["effect_size"][0], 2 * self.const(freq=0.5, alpha=-1) + ) + assert np.isclose( + sim_result["effect_size"][1], 2 * self.const(freq=0.75, alpha=-1) + ) + assert np.isclose( + sim_result["effect_size"][2], 2 * self.const(freq=0.25, alpha=-1) + ) + assert np.isclose( + sim_result["effect_size"][3], 2 * self.const(freq=0.25, alpha=-1) + ) + assert np.isclose( + sim_result["effect_size"][4], 2 * self.const(freq=0.25, alpha=-1) + ) + assert np.isclose( + sim_result["effect_size"][5], 2 * self.const(freq=0.5, alpha=-1) + ) diff --git a/tstrait/__init__.py b/tstrait/__init__.py index 055a321..3b0aa96 100644 --- a/tstrait/__init__.py +++ b/tstrait/__init__.py @@ -1,3 +1,5 @@ +from tstrait.simulate_effect_size import sim_trait +from tstrait.simulate_effect_size import TraitSimulator from tstrait.simulate_phenotype import GenotypeResult from tstrait.simulate_phenotype import PhenotypeResult from tstrait.simulate_phenotype import PhenotypeSimulator @@ -12,6 +14,8 @@ from tstrait.trait_model import TraitModelT __all__ = [ + "TraitSimulator", + "sim_trait", "sim_phenotype", "PhenotypeSimulator", "Result", diff --git a/tstrait/simulate_effect_size.py b/tstrait/simulate_effect_size.py new file mode 100644 index 0000000..14786c7 --- /dev/null +++ b/tstrait/simulate_effect_size.py @@ -0,0 +1,171 @@ +import collections +import numbers + +import numpy as np +import pandas as pd +import tskit +import tstrait + + +class TraitSimulator: + """Simulator class to select causal alleles and simulate effect sizes of causal + mutations. + + :param ts: Tree sequence data with mutation. + :type ts: tskit.TreeSequence + :param num_causal: Number of causal sites. + :type num_causal: int + :param model: Trait model that will be used to simulate effect sizes. + :type model: TraitModel + :param alpha: Parameter that determines the emphasis placed on rarer variants. + :type alpha: float + :param random_seed: The random seed. If this is not specified or None, simulation + will be done randomly. + :type random_seed: None or int + """ + + def __init__(self, ts, num_causal, model, alpha, random_seed): + self.ts = ts + self.num_causal = num_causal + self.model = model + self.alpha = alpha + self.rng = np.random.default_rng(random_seed) + + def _choose_causal_site(self): + """Randomly chooses causal site IDs among all the sites in the tree sequence + data. The site IDs are aligned based on their genomic positions as a part of + the tree sequence data requirement, so the chosen site IDs are sorted in the + final step. The algorithm will be faster if the site IDs are aligned by + their genomic locations. + """ + site_id = self.rng.choice( + range(self.ts.num_sites), size=self.num_causal, replace=False + ) + site_id = np.sort(site_id) + + return site_id + + def _obtain_allele_count(self, tree, site): + """Obtain a dictionary of allele counts, and the ancestral state is not + included in the dictionary. Input is the tree sequence site (`ts.site(ID)`) + instead of site ID, as obtaining `ts.site(ID)` can be time consuming. The + ancestral state is not deleted if the ancestral state is the only allele + at that site. + """ + counts = collections.Counter({site.ancestral_state: self.ts.num_samples}) + for m in site.mutations: + current_state = site.ancestral_state + if m.parent != tskit.NULL: + current_state = self.ts.mutation(m.parent).derived_state + # Silent mutations do nothing + if current_state != m.derived_state: + num_samples = tree.num_samples(m.node) + counts[m.derived_state] += num_samples + counts[current_state] -= num_samples + del counts[site.ancestral_state] + counts = {x: y for x, y in counts.items() if y != 0} + if len(counts) == 0: + counts = {site.ancestral_state: self.ts.num_samples} + return counts + + def _frequency_dependence(self, allele_freq): + if allele_freq == 0 or allele_freq == 1: + const = 0 + else: + const = np.sqrt(pow(2 * allele_freq * (1 - allele_freq), self.alpha)) + return const + + def sim_causal_mutation(self): + """This method randomly chooses causal sites and the corresponding causal state + based on the `num_causal` input. Afterwards, effect size of each causal site + is simulated based on the trait model given by the `model` input. + + :return: Returns a pandas dataframe that includes causal site ID, causal allele + and simulated effect size + :rtype: pandas.DataFrame + """ + causal_site_array = self._choose_causal_site() + num_samples = self.ts.num_samples + tree = tskit.Tree(self.ts) + + causal_state_array = np.zeros(self.num_causal, dtype=object) + beta_array = np.zeros(self.num_causal) + + for i, single_id in enumerate(causal_site_array): + site = self.ts.site(single_id) + tree.seek(site.position) + counts = self._obtain_allele_count(tree, site) + causal_state = self.rng.choice(list(counts)) + causal_state_array[i] = causal_state + allele_freq = counts[causal_state] / num_samples + beta = self.model.sim_effect_size(num_causal=self.num_causal, rng=self.rng) + beta *= self._frequency_dependence(allele_freq) + beta_array[i] = beta + + effect_size_df = pd.DataFrame( + { + "site_id": causal_site_array, + "causal_state": causal_state_array, + "effect_size": beta_array, + } + ) + + return effect_size_df + + +def sim_trait(ts, num_causal, model, alpha=0, random_seed=None): + """Randomly selects causal sites from the inputted tree sequence data, and simulates + effect sizes of causal mutations. + + :param ts: The tree sequence data that will be used in the quantitative trait + simulation. The tree sequence data must include a mutation. + :type ts: tskit.TreeSequence + :param num_causal: Number of causal sites that will be chosen randomly. It must be + a positive integer that is less than the number of sites in the tree sequence + data. + :type num_causal: int + :param model: Trait model that will be used to simulate effect sizes of causal + mutations. + :type model: TraitModel + :param alpha: Parameter that determines the relative weight on rarer variants. + A negative `alpha` value can increase the magnitude of effect sizes coming + from rarer variants. The frequency dependent architecture can be ignored + by setting `alpha` to be zero. + :type alpha: float + :param random_seed: The random seed. If this is not specified or None, simulation + will be done randomly. + :type random_seed: None or int + :return: Returns a pandas dataframe that includes causal site ID, causal allele and + simulated effect size. It can be used as an input in :func:`sim_phenotype` + function to simulate phenotypes. + :rtype: pandas.DataFrame + """ + if not isinstance(ts, tskit.TreeSequence): + raise TypeError("Input must be a tree sequence data") + if not isinstance(num_causal, numbers.Number): + raise TypeError("Number of causal sites must be an integer") + if int(num_causal) != num_causal or num_causal <= 0: + raise ValueError("Number of causal sites must be a positive integer") + if not isinstance(model, tstrait.TraitModel): + raise TypeError("Trait model must be an instance of TraitModel") + num_sites = ts.num_sites + if num_sites == 0: + raise ValueError("No mutation in the provided data") + if num_causal > num_sites: + raise ValueError( + "There are less number of sites in the tree sequence than the inputted " + "number of causal sites" + ) + if not isinstance(alpha, numbers.Number): + raise TypeError("Alpha must be a number") + + simulator = TraitSimulator( + ts=ts, + num_causal=num_causal, + model=model, + alpha=alpha, + random_seed=random_seed, + ) + effect_size_df = simulator.sim_causal_mutation() + + return effect_size_df