From 6f0d2650a39dda0c992d657a2e8dde7310e2a4ce Mon Sep 17 00:00:00 2001
From: daikitag <48062118+daikitag@users.noreply.github.com>
Date: Sat, 15 Jul 2023 00:04:03 +0900
Subject: [PATCH] dataftame-effect-size

Generate a pandas dataframe to output simulated effect sizes
---
 requirements/CI-complete/requirements.txt  |   1 +
 requirements/CI-docs/requirements.txt      |   1 +
 requirements/CI-tests-pip/requirements.txt |   1 +
 requirements/development.txt               |   1 +
 setup.cfg                                  |   1 +
 stats_tests.py                             | 561 +++++++++-------
 tests/test_sim_trait.py                    | 746 +++++++++++++++++++++
 tstrait/__init__.py                        |   4 +
 tstrait/simulate_effect_size.py            | 171 +++++
 9 files changed, 1227 insertions(+), 260 deletions(-)
 create mode 100644 tests/test_sim_trait.py
 create mode 100644 tstrait/simulate_effect_size.py

diff --git a/requirements/CI-complete/requirements.txt b/requirements/CI-complete/requirements.txt
index f2a4410..14bf36b 100644
--- a/requirements/CI-complete/requirements.txt
+++ b/requirements/CI-complete/requirements.txt
@@ -1,6 +1,7 @@
 msprime
 numba
 numpy
+pandas
 pytest==6.2.5
 pytest-cov==3.0.0
 tskit==0.5.3
\ No newline at end of file
diff --git a/requirements/CI-docs/requirements.txt b/requirements/CI-docs/requirements.txt
index 4f83450..4553860 100644
--- a/requirements/CI-docs/requirements.txt
+++ b/requirements/CI-docs/requirements.txt
@@ -5,6 +5,7 @@ matplotlib
 msprime
 numba
 numpy
+pandas
 scipy
 sphinx==4.5
 sphinx-argparse
diff --git a/requirements/CI-tests-pip/requirements.txt b/requirements/CI-tests-pip/requirements.txt
index 7ec4f48..13465f6 100644
--- a/requirements/CI-tests-pip/requirements.txt
+++ b/requirements/CI-tests-pip/requirements.txt
@@ -1,4 +1,5 @@
 msprime
 numba
 numpy
+pandas
 tskit==0.5.3
diff --git a/requirements/development.txt b/requirements/development.txt
index acf5a0b..d5c9975 100644
--- a/requirements/development.txt
+++ b/requirements/development.txt
@@ -7,6 +7,7 @@ matplotlib
 msprime
 numba
 numpy
+pandas
 pre-commit
 pytest
 scipy
diff --git a/setup.cfg b/setup.cfg
index 788fbab..f1fb39a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -26,6 +26,7 @@ python_requires = >=3.7
 install_requires = 
     numpy
     numba>=0.57.0
+    pandas
     tskit>=0.5.5
 
 [options.extras_require]
diff --git a/stats_tests.py b/stats_tests.py
index 904d108..f852a33 100644
--- a/stats_tests.py
+++ b/stats_tests.py
@@ -8,8 +8,7 @@
 import scipy.stats as stats
 import statsmodels.api as sm
 import tskit
-import tstrait.simulate_phenotype as simulate_phenotype
-import tstrait.trait_model as trait_model
+import tstrait
 from tqdm import tqdm
 
 
@@ -41,17 +40,27 @@ def sim_tree_seq():
 
     Site 0 Genotype:
         [A, A, T, T, A, T]
+        Ancestral state: A
+        Causal allele freq: 0.5
         Individual 0: 0 causal
         Individual 1: 2 causal
         Individual 2: 1 causal
 
     Site 1 Genotype:
         [A, A, A, A, A, T]
+        Ancestral state: T
+        Causal allele freq: 1/6
         Individual 0: 0 causal
         Individual 1: 0 causal
         Individual 2: 1 causal
+
+    Effect size distribution:
+
+    SD Formula:
+        trait_sd / sqrt(2) * [sqrt(2 * freq * (1 - freq))] ^ alpha
+        sqrt(2) from 2 causal sites
     """
-    ts = tskit.Tree.generate_comb(6, span=10).tree_sequence
+    ts = tskit.Tree.generate_comb(6, span=2).tree_sequence
 
     tables = ts.dump_tables()
     for j in range(2):
@@ -75,61 +84,11 @@ def sim_tree_seq():
     return ts
 
 
-def sim_tree_internal():
-    """
-    The following tree sequence will be generated:
-
-        6
-     4━━┻━━5
-    ┏┻━┓  ┏┻━┓
-    0  1  2  3
-
-    Individual 0: Node 4 and 5
-    Individual 1: Node 2 and 3
-    Individual 3: Node 4 and 5
-
-    Site 0 Ancestral State: "A"
-        Causal Mutation: Node 4
-
-    Site 1 Ancestral State: "A"
-        Causal Mutation: Node 2
-
-    Site 0 Genotype:
-        [T, T, A, A, T, A]
-        Individual 0: 1 causal
-        Individual 1: 2 causal
-        Individual 2: 0 causal
-
-    Site 1 Genotype:
-        [A, A, T, A, A, A]
-        Individual 0: 0 causal
-        Individual 1: 0 causal
-        Individual 2: 1 causal
-    """
-    ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence
-
-    tables = ts.dump_tables()
-    for j in range(2):
-        tables.sites.add_row(j, "A")
-
-    tables.individuals.add_row()
-    tables.individuals.add_row()
-    tables.individuals.add_row()
-    individuals = tables.nodes.individual
-    individuals[0] = 1
-    individuals[1] = 1
-    individuals[2] = 2
-    individuals[3] = 2
-    individuals[4] = 0
-    individuals[5] = 0
-    tables.nodes.individual = individuals
-
-    tables.mutations.add_row(site=0, node=4, derived_state="T")
-    tables.mutations.add_row(site=1, node=2, derived_state="T")
-
-    ts = tables.tree_sequence()
+def sim_tree_seq_freq(alpha):
+    const1 = np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha))
+    const2 = np.sqrt(pow(2 * 1 / 6 * (1 - 1 / 6), alpha))
 
-    return ts
+    return const1, const2
 
 
 class Test:
@@ -178,257 +137,336 @@ def plot_qq_normal(self, data, loc, scale, filename, title=""):
         plt.savefig(f, dpi=72)
         plt.close("all")
 
+    def plot_qq_exponential(self, data, scale, filename, title=""):
+        sm.qqplot(data, stats.expon, scale=scale, line="45")
+        plt.title(title)
+        f = self._build_filename(filename)
+        plt.savefig(f, dpi=72)
+        plt.close("all")
+
 
-class TestGenetic(Test):
+class TestNormal(Test):
     def test_normal(self):
-        """
-        Genotype of individuals:
-
-        Site 0 Genotype:
-            [A, A, T, T, A, T]
-            Ancestral state: A
-            Causal allele freq: 0.5
-            Individual 0: 0 causal
-            Individual 1: 2 causal
-            Individual 2: 1 causal
-
-        Site 1 Genotype:
-            [A, A, A, A, A, T]
-            Ancestral state: T
-            Causal allele freq: 1/6
-            Individual 0: 0 causal
-            Individual 1: 0 causal
-            Individual 2: 1 causal
-
-        Effect size distribution:
-
-        SD Formula:
-            trait_sd / sqrt(2) * [sqrt(2 * freq * (1 - freq))] ^ alpha
-            sqrt(2) from 2 causal sites
-
-        Environmental noise is simulated from a normal distribution where standard
-        deviation depends on the variance of the simulated genetic values
-        """
-        rng = np.random.default_rng(1)
-        alpha_array = np.array([0, -0.3])
-        trait_mean_array = np.array([0, 1])
-        trait_var_array = np.array([1, 2])
-        h2_array = np.array([0.3, 0.8])
+
+        alpha_array = np.array([0, -0.3, 1])
+        mean_array = np.array([0, 1])
+        var_array = np.array([1, 2])
 
         ts = sim_tree_seq()
 
         count = 0
 
-        prod = itertools.product(
-            alpha_array, trait_mean_array, trait_var_array, h2_array
-        )
-
-        for element in tqdm(prod, total=16):
+        prod = itertools.product(alpha_array, mean_array, var_array)
+        for element in tqdm(prod, total=12):
             alpha = element[0]
-            trait_mean = element[1]
-            trait_var = element[2]
-            trait_sd = np.sqrt(trait_var)
-            h2 = element[3]
-            model = trait_model.TraitModelAlleleFrequency(trait_mean, trait_var, alpha)
-            trait_mean /= 2
-            genetic0 = np.zeros(1000)
-            genetic1 = np.zeros(1000)
-            genetic2 = np.zeros(1000)
-
-            environment0 = np.zeros(1000)
-            environment1 = np.zeros(1000)
-            environment2 = np.zeros(1000)
-
+            mean = element[1]
+            var = element[2]
+            sd = np.sqrt(var)
+            const0, const1 = sim_tree_seq_freq(alpha)
+            model = tstrait.trait_model(distribution="normal", mean=mean, var=var)
+            effect_size0 = np.zeros(1000)
+            effect_size1 = np.zeros(1000)
             for i in range(1000):
-                sim_result = simulate_phenotype.sim_phenotype(
-                    ts, num_causal=2, model=model, h2=h2, random_seed=i
+                sim_result = tstrait.sim_trait(
+                    ts=ts,
+                    num_causal=2,
+                    model=model,
+                    alpha=alpha,
+                    random_seed=i + 1000 * count,
                 )
-                phenotype_result = sim_result.phenotype
-                genetic0[i] = phenotype_result.genetic_value[0]
-                genetic1[i] = phenotype_result.genetic_value[1]
-                genetic2[i] = phenotype_result.genetic_value[2]
-
-                environment0[i] = phenotype_result.environment_noise[0]
-                environment1[i] = phenotype_result.environment_noise[1]
-                environment2[i] = phenotype_result.environment_noise[2]
-            assert np.array_equal(genetic0, np.zeros(1000))
-
-            effect_size_sd0 = (
-                trait_sd / np.sqrt(2) * np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha))
+                effect_size0[i] = sim_result["effect_size"][0]
+                effect_size1[i] = sim_result["effect_size"][1]
+            self.plot_qq_normal(
+                data=effect_size0,
+                loc=mean * const0 / 2,
+                scale=sd * const0 / 2,
+                filename=f"effect_size_0_{count}",
+                title=f"EffectSize0, Normal, alpha = {alpha}, "
+                f"mean = {mean}, var = {var}",
             )
-            effect_size_sd1 = (
-                trait_sd / np.sqrt(2) * np.sqrt(pow(2 * 1 / 6 * (1 - 1 / 6), alpha))
+            self.plot_qq_normal(
+                data=effect_size1,
+                loc=mean * const1 / 2,
+                scale=sd * const1 / 2,
+                filename=f"effect_size_1_{count}",
+                title=f"EffectSize1, Normal, alpha = {alpha}, "
+                f"mean = {mean}, var = {var}",
             )
+            count += 1
 
-            effect_size_mean0 = trait_mean * np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha))
-            effect_size_mean1 = trait_mean * np.sqrt(
-                pow(2 * 1 / 6 * (1 - 1 / 6), alpha)
-            )
 
-            ind_sd1 = effect_size_sd0 * 2
-            ind_sd2 = np.sqrt((effect_size_sd0**2) + (effect_size_sd1**2))
+class TestExponential(Test):
+    def test_exponential(self):
 
-            self.plot_qq_normal(
-                data=genetic1,
-                loc=2 * effect_size_mean0,
-                scale=ind_sd1,
-                filename=f"ind_1_genetic_{count}",
-                title=f"Individual 1 Genetic, alpha = {alpha}, "
-                f"trait_mean = {trait_mean}, trait_sd = {trait_sd}, h2 = {h2}",
+        alpha_array = np.array([0, -0.3, 1])
+        scale_array = np.array([1, 2.5])
+
+        ts = sim_tree_seq()
+
+        count = 0
+
+        prod = itertools.product(alpha_array, scale_array)
+        for element in tqdm(prod, total=6):
+            alpha = element[0]
+            scale = element[1]
+            const0, const1 = sim_tree_seq_freq(alpha)
+            model = tstrait.trait_model(
+                distribution="exponential", scale=scale, negative=False
             )
-            self.plot_qq_normal(
-                data=genetic2,
-                loc=effect_size_mean0 + effect_size_mean1,
-                scale=ind_sd2,
-                filename=f"ind_2_genetic_{count}",
-                title=f"Individual 2 Genetic, alpha = {alpha}, "
-                f"trait_mean = {trait_mean}, trait_sd = {trait_sd}, h2 = {h2}",
+            effect_size0 = np.zeros(1000)
+            effect_size1 = np.zeros(1000)
+            for i in range(1000):
+                sim_result = tstrait.sim_trait(
+                    ts=ts,
+                    num_causal=2,
+                    model=model,
+                    alpha=alpha,
+                    random_seed=i + 1000 * count,
+                )
+                effect_size0[i] = sim_result["effect_size"][0]
+                effect_size1[i] = sim_result["effect_size"][1]
+            self.plot_qq_exponential(
+                data=effect_size0,
+                scale=scale / 2 * const0,
+                filename=f"effect_size_0_{count}",
+                title=f"EffectSize0, Exponential, alpha = {alpha}, "
+                f"scale = {scale}, negative = False",
+            )
+            self.plot_qq_exponential(
+                data=effect_size1,
+                scale=scale / 2 * const1,
+                filename=f"effect_size_1_{count}",
+                title=f"EffectSize1, Expnential, alpha = {alpha}, "
+                f"scale = {scale}, negative = False",
             )
+            count += 1
 
-            genetic_sim = np.zeros(3)
-            env_sim = np.zeros(1000)
+    def test_exponential_negative(self):
 
-            for i in range(1000):
-                x1 = rng.normal(loc=effect_size_mean0, scale=effect_size_sd0)
-                x2 = rng.normal(loc=effect_size_mean1, scale=effect_size_sd1)
-                genetic_sim[0] = 0
-                genetic_sim[1] = 2 * x1
-                genetic_sim[2] = x1 + x2
-                env_std = np.sqrt((1 - h2) / h2 * np.var(genetic_sim))
-                env_sim[i] = rng.normal(loc=0, scale=env_std)
+        alpha_array = np.array([0, -0.3, 1])
+        scale_array = np.array([1, 2.5])
 
-            self.plot_qq_compare(
-                v1=environment0,
-                v2=env_sim,
-                x_label="Individual 0 Environment",
-                y_label="Simulated values",
-                filename=f"ind_0_env_{count}",
-                title=f"Individual 0 Env, alpha = {alpha}, trait_mean = {trait_mean}, "
-                f"trait_sd = {trait_sd}, h2 = {h2}",
+        ts = sim_tree_seq()
+
+        count = 0
+
+        prod = itertools.product(alpha_array, scale_array)
+        for element in tqdm(prod, total=6):
+            alpha = element[0]
+            scale = element[1]
+            const0, const1 = sim_tree_seq_freq(alpha)
+            model = tstrait.trait_model(
+                distribution="exponential", scale=scale, negative=True
             )
+            effect_size0 = np.zeros(1000)
+            effect_size1 = np.zeros(1000)
+            for i in range(1000):
+                sim_result = tstrait.sim_trait(
+                    ts=ts,
+                    num_causal=2,
+                    model=model,
+                    alpha=alpha,
+                    random_seed=i + 1000 * count,
+                )
+                effect_size0[i] = sim_result["effect_size"][0]
+                effect_size1[i] = sim_result["effect_size"][1]
+            rng = np.random.default_rng(count + 1)
+            exponential = rng.exponential(scale=scale / 2, size=1000)
+            exponential *= rng.choice([-1, 1], size=1000)
+
             self.plot_qq_compare(
-                v1=environment1,
-                v2=env_sim,
-                x_label="Individual 1 Environment",
-                y_label="Simulated values",
-                filename=f"ind_1_env_{count}",
-                title=f"Individual 1 Env, alpha = {alpha}, trait_mean = {trait_mean}, "
-                f"trait_sd = {trait_sd}, h2 = {h2}",
+                v1=effect_size0,
+                v2=exponential * const0,
+                x_label="Simulated Effect Size",
+                y_label="Exponential Distribution",
+                filename=f"effect_size_0_negative_{count}",
+                title=f"EffectSize0, Exponential, alpha = {alpha}, "
+                f"scale = {scale}, negative = True",
             )
             self.plot_qq_compare(
-                v1=environment2,
-                v2=env_sim,
-                x_label="Individual 2 Environment",
-                y_label="Simulated values",
-                filename=f"ind_2_env_{count}",
-                title=f"Individual 2 Env, alpha = {alpha}, trait_mean = {trait_mean}, "
-                f"trait_sd = {trait_sd}, h2 = {h2}",
+                v1=effect_size1,
+                v2=exponential * const1,
+                x_label="Simulated Effect Size",
+                y_label="Exponential Distribution",
+                filename=f"effect_size_1_negative_{count}",
+                title=f"EffectSize1, Exponential, alpha = {alpha}, "
+                f"scale = {scale}, negative = True",
             )
 
             count += 1
 
 
-class TestInternal(Test):
-    def test_internal(self):
-        """
-        Genotype of individuals:
+class TestT(Test):
+    def test_t(self):
 
-        Site 0 Genotype:
-            [T, T, A, A, T, A]
-            Ancestral state: A
-            Causal allele freq: 0.5
-            Individual 0: 1 causal
-            Individual 1: 2 causal
-            Individual 2: 0 causal
+        alpha_array = np.array([0, -0.3, 1])
+        mean_array = np.array([0, 1])
+        var_array = np.array([1, 2])
+        df_array = np.array([10, 13.5])
 
-        Site 1 Genotype:
-            [A, A, T, A, A, A]
-            Ancestral state: A
-            Causal allele freq: 1/6
-            Individual 0: 0 causal
-            Individual 1: 0 causal
-            Individual 2: 1 causal
+        ts = sim_tree_seq()
 
-        Effect size distribution:
+        count = 0
 
-        SD Formula:
-            trait_sd / sqrt(2) * [sqrt(2 * freq * (1 - freq))] ^ alpha
-            sqrt(2) from 2 causal sites
+        prod = itertools.product(alpha_array, mean_array, var_array, df_array)
+        for element in tqdm(prod, total=24):
+            alpha = element[0]
+            mean = element[1]
+            var = element[2]
+            df = element[3]
+            sd = np.sqrt(var)
+            const0, const1 = sim_tree_seq_freq(alpha)
+            model = tstrait.trait_model(distribution="t", mean=mean, var=var, df=df)
+            effect_size0 = np.zeros(1000)
+            effect_size1 = np.zeros(1000)
+            for i in range(1000):
+                sim_result = tstrait.sim_trait(
+                    ts=ts,
+                    num_causal=2,
+                    model=model,
+                    alpha=alpha,
+                    random_seed=i + 1000 * count,
+                )
+                effect_size0[i] = sim_result["effect_size"][0]
+                effect_size1[i] = sim_result["effect_size"][1]
+            rng = np.random.default_rng(count + 1)
+            simulated_t = rng.standard_t(df, size=1000)
+            simulated_t = simulated_t * sd + mean
+            simulated_t /= 2
 
-        Environmental noise is simulated from a normal distribution where standard
-        deviation depends on the variance of the simulated genetic values
-        """
+            self.plot_qq_compare(
+                v1=effect_size0,
+                v2=simulated_t * const0,
+                x_label="Simulated Effect Size",
+                y_label="T Distribution",
+                filename=f"effect_size_0_{count}",
+                title=f"EffectSize0, T, alpha = {alpha}, mean = {mean}, "
+                f"var = {var}, df = {df}",
+            )
+            self.plot_qq_compare(
+                v1=effect_size1,
+                v2=simulated_t * const1,
+                x_label="Simulated Effect Size",
+                y_label="T Distribution",
+                filename=f"effect_size_1_{count}",
+                title=f"EffectSize1, T, alpha = {alpha}, mean = {mean}, "
+                f"var = {var}, df = {df}",
+            )
+            count += 1
 
-        ts = sim_tree_internal()
 
-        alpha_array = np.array([0, -1])
-        trait_mean_array = np.array([0, 1])
-        trait_var_array = np.array([1, 2])
-        h2_array = np.array([0.3, 0.8])
+class TestGamma(Test):
+    def test_gamma(self):
 
-        count = 0
+        alpha_array = np.array([0, -0.3, 1])
+        shape_array = np.array([1, 3.5])
+        scale_array = np.array([1, 2.5])
 
-        prod = itertools.product(
-            alpha_array, trait_mean_array, trait_var_array, h2_array
-        )
+        ts = sim_tree_seq()
 
-        for element in tqdm(prod, total=16):
-            alpha = element[0]
-            trait_mean = element[1]
-            trait_var = element[2]
-            trait_sd = np.sqrt(trait_var)
-            h2 = element[3]
-            model = trait_model.TraitModelAlleleFrequency(trait_mean, trait_var, alpha)
-            trait_mean /= 2
-            genetic0 = np.zeros(1000)
-            genetic1 = np.zeros(1000)
-            genetic2 = np.zeros(1000)
+        count = 0
 
+        prod = itertools.product(alpha_array, shape_array, scale_array)
+        for element in tqdm(prod, total=12):
+            alpha = element[0]
+            shape = element[1]
+            scale = element[2]
+            const0, const1 = sim_tree_seq_freq(alpha)
+            model = tstrait.trait_model(
+                distribution="gamma", shape=shape, scale=scale, negative=False
+            )
+            effect_size0 = np.zeros(1000)
+            effect_size1 = np.zeros(1000)
             for i in range(1000):
-                sim_result = simulate_phenotype.sim_phenotype(
-                    ts, num_causal=2, model=model, h2=h2, random_seed=i
+                sim_result = tstrait.sim_trait(
+                    ts=ts,
+                    num_causal=2,
+                    model=model,
+                    alpha=alpha,
+                    random_seed=i + 1000 * count,
                 )
-                phenotype_result = sim_result.phenotype
-                genetic0[i] = phenotype_result.genetic_value[0]
-                genetic1[i] = phenotype_result.genetic_value[1]
-                genetic2[i] = phenotype_result.genetic_value[2]
+                effect_size0[i] = sim_result["effect_size"][0]
+                effect_size1[i] = sim_result["effect_size"][1]
 
-            effect_size_sd0 = (
-                trait_sd / np.sqrt(2) * np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha))
-            )
-            effect_size_sd1 = (
-                trait_sd / np.sqrt(2) * np.sqrt(pow(2 * 1 / 6 * (1 - 1 / 6), alpha))
+            rng = np.random.default_rng(count + 1)
+            gamma = rng.gamma(shape=shape, scale=scale, size=1000)
+            gamma /= 2
+
+            self.plot_qq_compare(
+                v1=effect_size0,
+                v2=gamma * const0,
+                x_label="Simulated Effect Size",
+                y_label="Gamma Distribution",
+                filename=f"effect_size_0_{count}",
+                title=f"EffectSize0, Gamma, alpha = {alpha}, "
+                f"shape = {shape}, scale = {scale}, negative = False",
             )
-            effect_size_mean0 = trait_mean * np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha))
-            effect_size_mean1 = trait_mean * np.sqrt(
-                pow(2 * 1 / 6 * (1 - 1 / 6), alpha)
+            self.plot_qq_compare(
+                v1=effect_size1,
+                v2=gamma * const1,
+                x_label="Simulated Effect Size",
+                y_label="Gamma Distribution",
+                filename=f"effect_size_1_{count}",
+                title=f"EffectSize1, Gamma, alpha = {alpha}, "
+                f"shape = {shape}, scale = {scale}, negative = False",
             )
+            count += 1
 
-            self.plot_qq_normal(
-                data=genetic0,
-                loc=effect_size_mean0,
-                scale=effect_size_sd0,
-                filename=f"internal_ind_0_genetic_{count}",
-                title=f"Individual 0 Genetic, alpha = {alpha}, "
-                f"trait_mean = {trait_mean}, trait_sd = {trait_sd}, h2 = {h2}",
+    def test_gamma_negative(self):
+
+        alpha_array = np.array([0, -0.3, 1])
+        shape_array = np.array([1, 3.5])
+        scale_array = np.array([1, 2.5])
+
+        ts = sim_tree_seq()
+
+        count = 0
+
+        prod = itertools.product(alpha_array, shape_array, scale_array)
+        for element in tqdm(prod, total=12):
+            alpha = element[0]
+            shape = element[1]
+            scale = element[2]
+            const0, const1 = sim_tree_seq_freq(alpha)
+            model = tstrait.trait_model(
+                distribution="gamma", shape=shape, scale=scale, negative=True
             )
-            self.plot_qq_normal(
-                data=genetic1,
-                loc=2 * effect_size_mean0,
-                scale=2 * effect_size_sd0,
-                filename=f"internal_ind_1_genetic_{count}",
-                title=f"Individual 1 Genetic, alpha = {alpha}, "
-                f"trait_mean = {trait_mean}, trait_sd = {trait_sd}, h2 = {h2}",
+            effect_size0 = np.zeros(1000)
+            effect_size1 = np.zeros(1000)
+            for i in range(1000):
+                sim_result = tstrait.sim_trait(
+                    ts=ts,
+                    num_causal=2,
+                    model=model,
+                    alpha=alpha,
+                    random_seed=i + 1000 * count,
+                )
+                effect_size0[i] = sim_result["effect_size"][0]
+                effect_size1[i] = sim_result["effect_size"][1]
+
+            rng = np.random.default_rng(count + 1)
+            gamma = rng.gamma(shape=shape, scale=scale, size=1000)
+            gamma /= 2
+            gamma *= rng.choice([-1, 1], size=1000)
+
+            self.plot_qq_compare(
+                v1=effect_size0,
+                v2=gamma * const0,
+                x_label="Simulated Effect Size",
+                y_label="Gamma Distribution",
+                filename=f"effect_size_0_negative_{count}",
+                title=f"EffectSize0, Gamma, alpha = {alpha}, "
+                f"shape = {shape}, scale = {scale}, negative = True",
             )
-            self.plot_qq_normal(
-                data=genetic2,
-                loc=effect_size_mean1,
-                scale=effect_size_sd1,
-                filename=f"internal_ind_2_genetic_{count}",
-                title=f"Individual 2 Genetic, alpha = {alpha}, "
-                f"trait_mean = {trait_mean}, trait_sd = {trait_sd}, h2 = {h2}",
+            self.plot_qq_compare(
+                v1=effect_size1,
+                v2=gamma * const1,
+                x_label="Simulated Effect Size",
+                y_label="Gamma Distribution",
+                filename=f"effect_size_1_negative_{count}",
+                title=f"EffectSize1, Gamma, alpha = {alpha}, "
+                f"shape = {shape}, scale = {scale}, negative = True",
             )
-
             count += 1
 
 
@@ -440,4 +478,7 @@ def run_tests(suite, output_dir):
 
 
 if __name__ == "__main__":
-    run_tests(["TestGenetic", "TestInternal"], "_output/stats_tests_output")
+    run_tests(
+        ["TestNormal", "TestExponential", "TestT", "TestGamma"],
+        "_output/stats_tests_output",
+    )
diff --git a/tests/test_sim_trait.py b/tests/test_sim_trait.py
new file mode 100644
index 0000000..c1167e4
--- /dev/null
+++ b/tests/test_sim_trait.py
@@ -0,0 +1,746 @@
+import functools
+
+import msprime
+import numpy as np
+import pytest
+import tskit
+import tstrait
+
+
+@functools.lru_cache(maxsize=None)
+def all_trees_ts(n):
+    """
+    Generate a tree sequence that corresponds to the lexicographic listing
+    of all trees with n leaves (i.e. from tskit.all_trees(n)).
+
+    Note: it would be nice to include a version of this in the combinatorics
+    module at some point but the implementation is quite inefficient. Also
+    it's not entirely clear that the way we're allocating node times is
+    guaranteed to work.
+    """
+    tables = tskit.TableCollection(0)
+    for _ in range(n):
+        tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0)
+    for j in range(1, n):
+        tables.nodes.add_row(flags=0, time=j)
+
+    L = 0
+    for tree in tskit.all_trees(n):
+        for u in tree.preorder()[1:]:
+            tables.edges.add_row(L, L + 1, tree.parent(u), u)
+        L += 1
+    tables.sequence_length = L
+    tables.sort()
+    tables.simplify()
+    return tables.tree_sequence()
+
+
+class Test_sim_phenotype_output_dim:
+    def check_dimensions(self, df, num_causal):
+        assert len(df) == num_causal
+        assert df.shape[1] == 3
+        assert len(df["site_id"]) == num_causal
+        assert len(df["causal_state"]) == num_causal
+        assert len(df["effect_size"]) == num_causal
+
+    @pytest.mark.parametrize("num_ind", [1, 2, np.array([5])[0]])
+    @pytest.mark.parametrize("num_causal", [1, 2, np.array([3])[0]])
+    @pytest.mark.parametrize("alpha", [0, 1, -1.1])
+    @pytest.mark.parametrize("random_seed", [1, 2, None])
+    def test_output_dim_normal(self, num_ind, num_causal, alpha, random_seed):
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        ts = msprime.sim_ancestry(
+            num_ind, sequence_length=100_000, random_seed=random_seed
+        )
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=random_seed)
+        sim_result = tstrait.sim_trait(
+            ts=ts,
+            num_causal=num_causal,
+            model=model,
+            alpha=alpha,
+            random_seed=random_seed,
+        )
+        self.check_dimensions(sim_result, num_causal)
+
+    def test_binary_mutation_model(self):
+        random_seed = 1
+        num_ind = 10
+        num_causal = 5
+        alpha = 1
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        ts = msprime.sim_ancestry(
+            num_ind, sequence_length=100_000, random_seed=random_seed
+        )
+        ts = msprime.sim_mutations(
+            ts, rate=0.01, random_seed=random_seed, model="binary"
+        )
+        sim_result = tstrait.sim_trait(
+            ts=ts,
+            num_causal=num_causal,
+            model=model,
+            alpha=alpha,
+            random_seed=random_seed,
+        )
+        self.check_dimensions(sim_result, num_causal)
+
+    def test_output_dim_exponential(self):
+        model = tstrait.trait_model(distribution="exponential", scale=1)
+        random_seed = 1
+        num_ind = 10
+        num_causal = 5
+        alpha = 1
+        ts = msprime.sim_ancestry(
+            num_ind, sequence_length=100_000, random_seed=random_seed
+        )
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=random_seed)
+        sim_result = tstrait.sim_trait(
+            ts=ts,
+            num_causal=num_causal,
+            model=model,
+            alpha=alpha,
+            random_seed=random_seed,
+        )
+        self.check_dimensions(sim_result, num_causal)
+
+    def test_output_dim_fixed(self):
+        model = tstrait.trait_model(distribution="fixed", value=1)
+        random_seed = 1
+        num_ind = 10
+        num_causal = 5
+        alpha = 1
+        ts = msprime.sim_ancestry(
+            num_ind, sequence_length=100_000, random_seed=random_seed
+        )
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=random_seed)
+        sim_result = tstrait.sim_trait(
+            ts=ts,
+            num_causal=num_causal,
+            model=model,
+            alpha=alpha,
+            random_seed=random_seed,
+        )
+        self.check_dimensions(sim_result, num_causal)
+
+    def test_output_dim_t(self):
+        model = tstrait.trait_model(distribution="t", mean=0, var=1, df=1)
+        random_seed = 1
+        num_ind = 10
+        num_causal = 5
+        alpha = 1
+        ts = msprime.sim_ancestry(
+            num_ind, sequence_length=100_000, random_seed=random_seed
+        )
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=random_seed)
+        sim_result = tstrait.sim_trait(
+            ts=ts,
+            num_causal=num_causal,
+            model=model,
+            alpha=alpha,
+            random_seed=random_seed,
+        )
+        self.check_dimensions(sim_result, num_causal)
+
+    def test_output_dim_gamma(self):
+        model = tstrait.trait_model(distribution="gamma", shape=1, scale=1)
+        random_seed = 1
+        num_ind = 10
+        num_causal = 5
+        alpha = 1
+        ts = msprime.sim_ancestry(
+            num_ind, sequence_length=100_000, random_seed=random_seed
+        )
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=random_seed)
+        sim_result = tstrait.sim_trait(
+            ts=ts,
+            num_causal=num_causal,
+            model=model,
+            alpha=alpha,
+            random_seed=random_seed,
+        )
+        self.check_dimensions(sim_result, num_causal)
+
+
+class Test_sim_phenotype_input:
+    @pytest.mark.parametrize("ts", [0, "a", [1, 1]])
+    def test_ts(self, ts):
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        with pytest.raises(TypeError, match="Input must be a tree sequence data"):
+            tstrait.sim_trait(ts=ts, num_causal=3, model=model, alpha=1, random_seed=1)
+
+    @pytest.mark.parametrize("num_causal", ["1", "a", [1, 1]])
+    def test_num_causal(self, num_causal):
+        ts = msprime.sim_ancestry(2, sequence_length=100_000, random_seed=1)
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1)
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        with pytest.raises(
+            TypeError, match="Number of causal sites must be an integer"
+        ):
+            tstrait.sim_trait(
+                ts=ts, num_causal=num_causal, model=model, alpha=1, random_seed=1
+            )
+
+    @pytest.mark.parametrize("num_causal", [-1, 1.8, -1.5, 0])
+    def test_num_causal_positive(self, num_causal):
+        ts = msprime.sim_ancestry(2, sequence_length=100_000, random_seed=1)
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1)
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        with pytest.raises(
+            ValueError, match="Number of causal sites must be a positive integer"
+        ):
+            tstrait.sim_trait(
+                ts=ts, num_causal=num_causal, model=model, alpha=1, random_seed=1
+            )
+
+    @pytest.mark.parametrize("model", ["normal", 1, [None, 1, "a"]])
+    def test_model(self, model):
+        ts = msprime.sim_ancestry(5, sequence_length=100_000, random_seed=2)
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=2)
+        with pytest.raises(
+            TypeError, match="Trait model must be an instance of TraitModel"
+        ):
+            tstrait.sim_trait(ts=ts, num_causal=1, model=model, alpha=1, random_seed=1)
+
+    def test_error_mutation(self):
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        ts = msprime.sim_ancestry(10, sequence_length=100_000, random_seed=1)
+        with pytest.raises(ValueError, match="No mutation in the provided data"):
+            tstrait.sim_trait(
+                ts=ts,
+                num_causal=5,
+                model=model,
+                alpha=0,
+                random_seed=1,
+            )
+
+    def test_error_num_causal_add(self):
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        ts = msprime.sim_ancestry(10, sequence_length=100_000, random_seed=1)
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1)
+        num_causal = ts.num_sites + 1
+        with pytest.raises(
+            ValueError,
+            match="There are less number of sites in the "
+            "tree sequence than the inputted number of causal sites",
+        ):
+            tstrait.sim_trait(
+                ts=ts,
+                num_causal=num_causal,
+                model=model,
+                alpha=0,
+                random_seed=1,
+            )
+
+    def test_output_num_causal(self):
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        ts = msprime.sim_ancestry(3, sequence_length=100_000, random_seed=1)
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1)
+        num_causal = ts.num_sites
+        sim_result = tstrait.sim_trait(
+            ts=ts, num_causal=num_causal, model=model, alpha=-1, random_seed=1
+        )
+        assert sim_result.shape[0] == num_causal
+
+    @pytest.mark.parametrize("alpha", ["1", "a", [1, 1]])
+    def test_alpha(self, alpha):
+        ts = msprime.sim_ancestry(2, sequence_length=100_000, random_seed=1)
+        ts = msprime.sim_mutations(ts, rate=0.01, random_seed=1)
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        with pytest.raises(TypeError, match="Alpha must be a number"):
+            tstrait.sim_trait(
+                ts=ts, num_causal=3, model=model, alpha=alpha, random_seed=1
+            )
+
+
+class Test_site_genotypes:
+    def test_binary_tree(self):
+        #  3.00   6
+        #     ┊ ┏━┻━┓    ┊
+        # 2.00┊ ┃   5   ┊
+        #     ┊ ┃ ┏━┻┓   ┊
+        # 1.00┊ ┃ ┃  4  ┊
+        #     ┊ ┃ ┃ ┏┻┓  ┊
+        # 0.00 0 1 2 3
+        #
+        ts = tskit.Tree.generate_comb(4, span=15).tree_sequence
+        tables = ts.dump_tables()
+        for j in range(12):
+            tables.sites.add_row(j, "A")
+
+        tables.individuals.add_row()
+        tables.individuals.add_row()
+        individuals = tables.nodes.individual
+        individuals[0] = 0
+        individuals[1] = 0
+        individuals[2] = 1
+        individuals[3] = 1
+        tables.nodes.individual = individuals
+
+        tables.mutations.add_row(site=0, node=0, derived_state="T")
+        tables.mutations.add_row(site=1, node=4, derived_state="T")
+        tables.mutations.add_row(site=2, node=1, derived_state="T")
+
+        tables.mutations.add_row(site=3, node=5, derived_state="T")
+        tables.mutations.add_row(site=3, node=2, derived_state="T", parent=3)
+
+        tables.mutations.add_row(site=4, node=0, derived_state="T")
+        tables.mutations.add_row(site=4, node=4, derived_state="T")
+
+        tables.mutations.add_row(site=5, node=0, derived_state="T")
+        tables.mutations.add_row(site=5, node=0, derived_state="A", parent=7)
+
+        tables.mutations.add_row(site=6, node=0, derived_state="T")
+        tables.mutations.add_row(site=6, node=0, derived_state="G", parent=9)
+        tables.mutations.add_row(site=6, node=0, derived_state="T", parent=10)
+        tables.mutations.add_row(site=6, node=0, derived_state="C", parent=11)
+
+        tables.mutations.add_row(site=7, node=5, derived_state="T")
+        tables.mutations.add_row(site=7, node=4, derived_state="C", parent=13)
+
+        tables.mutations.add_row(site=8, node=5, derived_state="T")
+        tables.mutations.add_row(site=8, node=4, derived_state="C", parent=15)
+        tables.mutations.add_row(site=8, node=4, derived_state="T", parent=16)
+        tables.mutations.add_row(site=8, node=4, derived_state="A", parent=17)
+        tables.mutations.add_row(site=8, node=4, derived_state="T", parent=18)
+
+        tables.mutations.add_row(site=9, node=6, derived_state="A")
+
+        tables.mutations.add_row(site=10, node=6, derived_state="C")
+
+        tables.mutations.add_row(site=11, node=5, derived_state="C")
+        tables.mutations.add_row(site=11, node=0, derived_state="T")
+
+        ts = tables.tree_sequence()
+
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        simulator = tstrait.TraitSimulator(
+            ts=ts, num_causal=1, model=model, alpha=0.3, random_seed=1
+        )
+        tree = ts.first()
+
+        c1 = simulator._obtain_allele_count(tree, ts.site(0))
+        c2 = simulator._obtain_allele_count(tree, ts.site(1))
+        c3 = simulator._obtain_allele_count(tree, ts.site(2))
+        c4 = simulator._obtain_allele_count(tree, ts.site(3))
+        c5 = simulator._obtain_allele_count(tree, ts.site(4))
+        c6 = simulator._obtain_allele_count(tree, ts.site(5))
+        c7 = simulator._obtain_allele_count(tree, ts.site(6))
+        c8 = simulator._obtain_allele_count(tree, ts.site(7))
+        c9 = simulator._obtain_allele_count(tree, ts.site(8))
+        c10 = simulator._obtain_allele_count(tree, ts.site(10))
+        c11 = simulator._obtain_allele_count(tree, ts.site(11))
+
+        assert c1 == {"T": 1}
+        assert c2 == {"T": 2}
+        assert c3 == {"T": 1}
+        assert c4 == {"T": 3}
+        assert c5 == {"T": 3}
+        assert c6 == {"A": 4}
+        assert c7 == {"C": 1}
+        assert c8 == {"T": 1, "C": 2}
+        assert c9 == {"T": 3}
+        assert c10 == {"C": 4}
+        assert c11 == {"C": 3, "T": 1}
+
+    def test_binary_tree_different_individual(self):
+        #  3.00   6
+        #     ┊ ┏━┻━┓    ┊
+        # 2.00┊ ┃   5   ┊
+        #     ┊ ┃ ┏━┻┓   ┊
+        # 1.00┊ ┃ ┃  4  ┊
+        #     ┊ ┃ ┃ ┏┻┓  ┊
+        # 0.00 0 1 2 3
+        #
+        ts = tskit.Tree.generate_comb(4, span=10).tree_sequence
+        tables = ts.dump_tables()
+        for j in range(9):
+            tables.sites.add_row(j, "A")
+
+        tables.individuals.add_row()
+        tables.individuals.add_row()
+        individuals = tables.nodes.individual
+        individuals[0] = 0
+        individuals[1] = 1
+        individuals[2] = 0
+        individuals[3] = 1
+        tables.nodes.individual = individuals
+
+        tables.mutations.add_row(site=0, node=0, derived_state="T")
+        tables.mutations.add_row(site=1, node=4, derived_state="T")
+        tables.mutations.add_row(site=2, node=1, derived_state="T")
+
+        tables.mutations.add_row(site=3, node=5, derived_state="T")
+        tables.mutations.add_row(site=3, node=2, derived_state="T", parent=3)
+
+        tables.mutations.add_row(site=4, node=0, derived_state="T")
+        tables.mutations.add_row(site=4, node=4, derived_state="T")
+
+        tables.mutations.add_row(site=5, node=0, derived_state="T")
+        tables.mutations.add_row(site=5, node=0, derived_state="A", parent=7)
+
+        tables.mutations.add_row(site=6, node=0, derived_state="T")
+        tables.mutations.add_row(site=6, node=0, derived_state="G", parent=9)
+        tables.mutations.add_row(site=6, node=0, derived_state="T", parent=10)
+        tables.mutations.add_row(site=6, node=0, derived_state="C", parent=11)
+
+        tables.mutations.add_row(site=7, node=5, derived_state="T")
+        tables.mutations.add_row(site=7, node=4, derived_state="C", parent=13)
+
+        tables.mutations.add_row(site=8, node=5, derived_state="T")
+        tables.mutations.add_row(site=8, node=4, derived_state="C", parent=15)
+        tables.mutations.add_row(site=8, node=4, derived_state="T", parent=16)
+        tables.mutations.add_row(site=8, node=4, derived_state="A", parent=17)
+        tables.mutations.add_row(site=8, node=4, derived_state="T", parent=18)
+        ts = tables.tree_sequence()
+
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        simulator = tstrait.TraitSimulator(
+            ts=ts, num_causal=1, model=model, alpha=0.3, random_seed=1
+        )
+        tree = ts.first()
+
+        c1 = simulator._obtain_allele_count(tree, ts.site(0))
+        c2 = simulator._obtain_allele_count(tree, ts.site(1))
+        c3 = simulator._obtain_allele_count(tree, ts.site(2))
+        c4 = simulator._obtain_allele_count(tree, ts.site(3))
+        c5 = simulator._obtain_allele_count(tree, ts.site(4))
+        c6 = simulator._obtain_allele_count(tree, ts.site(5))
+        c7 = simulator._obtain_allele_count(tree, ts.site(6))
+        c8 = simulator._obtain_allele_count(tree, ts.site(7))
+        c9 = simulator._obtain_allele_count(tree, ts.site(8))
+
+        assert c1 == {"T": 1}
+        assert c2 == {"T": 2}
+        assert c3 == {"T": 1}
+        assert c4 == {"T": 3}
+        assert c5 == {"T": 3}
+        assert c6 == {"A": 4}
+        assert c7 == {"C": 1}
+        assert c8 == {"T": 1, "C": 2}
+        assert c9 == {"T": 3}
+
+    def test_binary_tree_internal_node(self):
+        ts = tskit.Tree.generate_balanced(4, span=10).tree_sequence
+
+        tables = ts.dump_tables()
+        for j in range(3):
+            tables.sites.add_row(j, "A")
+
+        tables.individuals.add_row()
+        tables.individuals.add_row()
+        tables.individuals.add_row()
+        individuals = tables.nodes.individual
+        individuals[0] = 1
+        individuals[1] = 1
+        individuals[2] = 2
+        individuals[3] = 2
+        individuals[4] = 0
+        individuals[5] = 0
+        tables.nodes.individual = individuals
+
+        tables.mutations.add_row(site=0, node=0, derived_state="T")
+        tables.mutations.add_row(site=0, node=5, derived_state="G")
+
+        tables.mutations.add_row(site=1, node=4, derived_state="T")
+        tables.mutations.add_row(site=1, node=0, derived_state="A", parent=2)
+        tables.mutations.add_row(site=1, node=5, derived_state="T")
+        tables.mutations.add_row(site=1, node=5, derived_state="G", parent=4)
+
+        tables.mutations.add_row(site=2, node=4, derived_state="T")
+        tables.mutations.add_row(site=2, node=0, derived_state="G", parent=6)
+        tables.mutations.add_row(site=2, node=1, derived_state="C", parent=6)
+
+        ts = tables.tree_sequence()
+
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        simulator = tstrait.TraitSimulator(
+            ts=ts, num_causal=1, model=model, alpha=0.3, random_seed=1
+        )
+        tree = ts.first()
+
+        c1 = simulator._obtain_allele_count(tree, ts.site(0))
+        c2 = simulator._obtain_allele_count(tree, ts.site(1))
+        c3 = simulator._obtain_allele_count(tree, ts.site(2))
+
+        assert c1 == {"T": 1, "G": 2}
+        assert c2 == {"T": 1, "G": 2}
+        assert c3 == {"G": 1, "C": 1}
+
+    def test_non_binary_tree(self):
+        # 2.00      7
+        #     ┊ ┏━┏━━┏━┻━━┓   ┊
+        # 1.00┊ ┃ ┃ ┃     6   ┊
+        #     ┊ ┃ ┃ ┃  ┏━┳┻┓ ┊
+        # 0.00 0 1 2   3 4 5
+        ts = tskit.Tree.generate_balanced(6, arity=4, span=10).tree_sequence
+
+        tables = ts.dump_tables()
+        for j in range(3):
+            tables.sites.add_row(j, "A")
+
+        tables.individuals.add_row()
+        tables.individuals.add_row()
+        tables.individuals.add_row()
+        individuals = tables.nodes.individual
+        individuals[0] = 0
+        individuals[1] = 0
+        individuals[2] = 1
+        individuals[3] = 1
+        individuals[4] = 2
+        individuals[5] = 2
+        tables.nodes.individual = individuals
+
+        tables.mutations.add_row(site=0, node=6, derived_state="T")
+        tables.mutations.add_row(site=1, node=0, derived_state="T")
+        tables.mutations.add_row(site=2, node=6, derived_state="T")
+        tables.mutations.add_row(site=2, node=6, derived_state="C", parent=2)
+        tables.mutations.add_row(site=2, node=5, derived_state="T", parent=3)
+
+        ts = tables.tree_sequence()
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        simulator = tstrait.TraitSimulator(
+            ts=ts, num_causal=1, model=model, alpha=0.3, random_seed=1
+        )
+        tree = ts.first()
+
+        c1 = simulator._obtain_allele_count(tree, ts.site(0))
+        c2 = simulator._obtain_allele_count(tree, ts.site(1))
+        c3 = simulator._obtain_allele_count(tree, ts.site(2))
+
+        assert c1 == {"T": 3}
+        assert c2 == {"T": 1}
+        assert c3 == {"C": 2, "T": 1}
+
+    def test_binary_tree_additional(self):
+        ts = tskit.Tree.generate_comb(6, span=10).tree_sequence
+        tables = ts.dump_tables()
+        for j in range(5):
+            tables.sites.add_row(j, "A")
+
+        tables.individuals.add_row()
+        tables.individuals.add_row()
+        tables.individuals.add_row()
+        individuals = tables.nodes.individual
+        individuals[0] = 0
+        individuals[1] = 0
+        individuals[2] = 1
+        individuals[3] = 1
+        individuals[4] = 2
+        individuals[5] = 2
+        tables.nodes.individual = individuals
+
+        tables.mutations.add_row(site=0, node=7, derived_state="T")
+        tables.mutations.add_row(site=1, node=0, derived_state="T")
+        tables.mutations.add_row(site=1, node=0, derived_state="A", parent=1)
+
+        tables.mutations.add_row(site=2, node=9, derived_state="T")
+        tables.mutations.add_row(site=2, node=3, derived_state="C", parent=3)
+
+        tables.mutations.add_row(site=3, node=8, derived_state="T")
+        tables.mutations.add_row(site=3, node=8, derived_state="A", parent=5)
+        tables.mutations.add_row(site=3, node=8, derived_state="C", parent=6)
+        tables.mutations.add_row(site=3, node=3, derived_state="A", parent=7)
+        tables.mutations.add_row(site=3, node=6, derived_state="G", parent=7)
+
+        tables.mutations.add_row(site=4, node=8, derived_state="A")
+
+        ts = tables.tree_sequence()
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        simulator = tstrait.TraitSimulator(
+            ts=ts, num_causal=1, model=model, alpha=0.3, random_seed=1
+        )
+        tree = ts.first()
+
+        c1 = simulator._obtain_allele_count(tree, ts.site(0))
+        c2 = simulator._obtain_allele_count(tree, ts.site(1))
+        c3 = simulator._obtain_allele_count(tree, ts.site(2))
+        c4 = simulator._obtain_allele_count(tree, ts.site(3))
+        c5 = simulator._obtain_allele_count(tree, ts.site(4))
+
+        assert c1 == {"T": 3}
+        assert c2 == {"A": 6}
+        assert c3 == {"T": 4, "C": 1}
+        assert c4 == {"G": 2, "C": 1}
+        assert c5 == {"A": 6}
+
+
+def sim_tree_seq():
+    """
+    The following tree sequence will be generated:
+      10
+    ┏━┻┓
+    ┃  9
+    ┃ ┏┻━┓
+    ┃ ┃   8
+    ┃ ┃  ┏┻━┓
+    ┃ ┃  ┃  7
+    ┃ ┃  ┃ ┏┻━┓
+    ┃ ┃  ┃ ┃  6
+    ┃ ┃  ┃ ┃ ┏┻━┓
+    0 1  2 3 4  5
+
+    Individual 0: Node 0 and 1
+    Individual 1: Node 2 and 3
+    Individual 3: Node 4 and 5
+
+    Site 0 Ancestral State: "A"
+        Causal Mutation: Node 8
+        Reverse Mutation: Node 4
+
+    Site 1 Ancestral State: "A"
+        Causal Mutation: Node 5
+
+    Site 0 Genotype:
+        [A, A, T, T, A, T]
+        Ancestral state: A
+        Causal allele freq: 0.5
+        Individual 0: 0 causal
+        Individual 1: 2 causal
+        Individual 2: 1 causal
+
+    Site 1 Genotype:
+        [A, A, A, A, A, T]
+        Ancestral state: A
+        Causal allele freq: 1/6
+        Individual 0: 0 causal
+        Individual 1: 0 causal
+        Individual 2: 1 causal
+
+    Effect size distribution:
+
+    SD Formula:
+        trait_sd / sqrt(2) * [sqrt(2 * freq * (1 - freq))] ^ alpha
+        sqrt(2) from 2 causal sites
+    """
+    ts = tskit.Tree.generate_comb(6, span=2).tree_sequence
+
+    tables = ts.dump_tables()
+    for j in range(2):
+        tables.sites.add_row(j, "A")
+
+    tables.individuals.add_row()
+    tables.individuals.add_row()
+    tables.individuals.add_row()
+    individuals = tables.nodes.individual
+    individuals[0] = 0
+    individuals[1] = 0
+    individuals[2] = 1
+    individuals[3] = 1
+    individuals[4] = 2
+    individuals[5] = 2
+    tables.nodes.individual = individuals
+    tables.mutations.add_row(site=0, node=8, derived_state="T")
+    tables.mutations.add_row(site=0, node=4, derived_state="A", parent=0)
+    tables.mutations.add_row(site=1, node=5, derived_state="T")
+    ts = tables.tree_sequence()
+    return ts
+
+
+def sim_tree_seq_freq(alpha):
+    const1 = np.sqrt(pow(2 * 0.5 * (1 - 0.5), alpha))
+    const2 = np.sqrt(pow(2 * 1 / 6 * (1 - 1 / 6), alpha))
+
+    return const1, const2
+
+
+class Test_fixed_effect_size:
+    @pytest.mark.parametrize("value", [0, 1, -2])
+    @pytest.mark.parametrize("alpha", [0, 1, -0.5])
+    @pytest.mark.parametrize("random_seed", [1, 2, None])
+    def test_output_dim_normal(self, value, alpha, random_seed):
+        model = tstrait.trait_model(distribution="fixed", value=value)
+        ts = sim_tree_seq()
+        sim_result = tstrait.sim_trait(
+            ts=ts, num_causal=2, model=model, alpha=alpha, random_seed=random_seed
+        )
+        const1, const2 = sim_tree_seq_freq(alpha)
+        assert np.array_equal(sim_result["site_id"], np.array([0, 1]))
+        assert np.array_equal(sim_result["causal_state"], np.array(["T", "T"]))
+        assert np.isclose(sim_result["effect_size"][0], value * const1)
+        assert np.isclose(sim_result["effect_size"][1], value * const2)
+
+
+class Test_allele_freq_one:
+    @pytest.mark.parametrize("alpha", [0, 1, -0.5])
+    def test_allele_freq_one(self, alpha):
+        ts = tskit.Tree.generate_comb(6, span=2).tree_sequence
+        tables = ts.dump_tables()
+        for j in range(2):
+            tables.sites.add_row(j, "A")
+        tables.mutations.add_row(site=0, node=8, derived_state="T")
+        tables.mutations.add_row(site=0, node=8, derived_state="A", parent=0)
+        tables.mutations.add_row(site=1, node=5, derived_state="T")
+        tables.mutations.add_row(site=1, node=5, derived_state="A", parent=2)
+        ts = tables.tree_sequence()
+        model = tstrait.trait_model(distribution="normal", mean=0, var=1)
+        sim_result = tstrait.sim_trait(
+            ts=ts, num_causal=2, model=model, alpha=alpha, random_seed=1
+        )
+        assert np.array_equal(sim_result["effect_size"], np.array([0, 0]))
+
+
+class Test_tree_sequence:
+    def const(self, freq, alpha):
+        value = np.sqrt(pow(2 * freq * (1 - freq), alpha))
+        return value
+
+    def test_tree_sequence(self):
+        ts = all_trees_ts(4)
+        tables = ts.dump_tables()
+
+        tables.individuals.add_row()
+        tables.individuals.add_row()
+        individuals = tables.nodes.individual
+        individuals[0] = 0
+        individuals[1] = 0
+        individuals[2] = 1
+        individuals[3] = 1
+        tables.nodes.individual = individuals
+
+        tables.sites.add_row(1, "A")
+        tables.sites.add_row(7, "A")
+        tables.sites.add_row(11, "G")
+        tables.sites.add_row(12, "C")
+        tables.sites.add_row(13, "A")
+        tables.sites.add_row(14, "A")
+
+        tables.mutations.add_row(site=0, node=4, derived_state="T")
+        tables.mutations.add_row(site=1, node=4, derived_state="T")
+        tables.mutations.add_row(site=2, node=5, derived_state="C")
+        tables.mutations.add_row(site=2, node=4, derived_state="G", parent=2)
+        tables.mutations.add_row(site=3, node=5, derived_state="C")
+        tables.mutations.add_row(site=3, node=3, derived_state="T", parent=4)
+        tables.mutations.add_row(site=4, node=3, derived_state="T")
+        tables.mutations.add_row(site=4, node=3, derived_state="A", parent=6)
+        tables.mutations.add_row(site=4, node=3, derived_state="C", parent=7)
+        tables.mutations.add_row(site=5, node=4, derived_state="C")
+        tables.mutations.add_row(site=5, node=4, derived_state="C", parent=9)
+        tables.mutations.add_row(site=5, node=1, derived_state="A")
+
+        ts = tables.tree_sequence()
+
+        model = tstrait.trait_model(distribution="fixed", value=2)
+        sim_result = tstrait.sim_trait(
+            ts=ts, num_causal=6, model=model, alpha=-1, random_seed=1
+        )
+        assert np.array_equal(sim_result["site_id"], np.array([0, 1, 2, 3, 4, 5]))
+        assert np.array_equal(
+            sim_result["causal_state"], np.array(["T", "T", "C", "T", "C", "C"])
+        )
+        assert np.isclose(
+            sim_result["effect_size"][0], 2 * self.const(freq=0.5, alpha=-1)
+        )
+        assert np.isclose(
+            sim_result["effect_size"][1], 2 * self.const(freq=0.75, alpha=-1)
+        )
+        assert np.isclose(
+            sim_result["effect_size"][2], 2 * self.const(freq=0.25, alpha=-1)
+        )
+        assert np.isclose(
+            sim_result["effect_size"][3], 2 * self.const(freq=0.25, alpha=-1)
+        )
+        assert np.isclose(
+            sim_result["effect_size"][4], 2 * self.const(freq=0.25, alpha=-1)
+        )
+        assert np.isclose(
+            sim_result["effect_size"][5], 2 * self.const(freq=0.5, alpha=-1)
+        )
diff --git a/tstrait/__init__.py b/tstrait/__init__.py
index 055a321..3b0aa96 100644
--- a/tstrait/__init__.py
+++ b/tstrait/__init__.py
@@ -1,3 +1,5 @@
+from tstrait.simulate_effect_size import sim_trait
+from tstrait.simulate_effect_size import TraitSimulator
 from tstrait.simulate_phenotype import GenotypeResult
 from tstrait.simulate_phenotype import PhenotypeResult
 from tstrait.simulate_phenotype import PhenotypeSimulator
@@ -12,6 +14,8 @@
 from tstrait.trait_model import TraitModelT
 
 __all__ = [
+    "TraitSimulator",
+    "sim_trait",
     "sim_phenotype",
     "PhenotypeSimulator",
     "Result",
diff --git a/tstrait/simulate_effect_size.py b/tstrait/simulate_effect_size.py
new file mode 100644
index 0000000..14786c7
--- /dev/null
+++ b/tstrait/simulate_effect_size.py
@@ -0,0 +1,171 @@
+import collections
+import numbers
+
+import numpy as np
+import pandas as pd
+import tskit
+import tstrait
+
+
+class TraitSimulator:
+    """Simulator class to select causal alleles and simulate effect sizes of causal
+    mutations.
+
+    :param ts: Tree sequence data with mutation.
+    :type ts: tskit.TreeSequence
+    :param num_causal: Number of causal sites.
+    :type num_causal: int
+    :param model: Trait model that will be used to simulate effect sizes.
+    :type model: TraitModel
+    :param alpha: Parameter that determines the emphasis placed on rarer variants.
+    :type alpha: float
+    :param random_seed: The random seed. If this is not specified or None, simulation
+        will be done randomly.
+    :type random_seed: None or int
+    """
+
+    def __init__(self, ts, num_causal, model, alpha, random_seed):
+        self.ts = ts
+        self.num_causal = num_causal
+        self.model = model
+        self.alpha = alpha
+        self.rng = np.random.default_rng(random_seed)
+
+    def _choose_causal_site(self):
+        """Randomly chooses causal site IDs among all the sites in the tree sequence
+        data. The site IDs are aligned based on their genomic positions as a part of
+        the tree sequence data requirement, so the chosen site IDs are sorted in the
+        final step. The algorithm will be faster if the site IDs are aligned by
+        their genomic locations.
+        """
+        site_id = self.rng.choice(
+            range(self.ts.num_sites), size=self.num_causal, replace=False
+        )
+        site_id = np.sort(site_id)
+
+        return site_id
+
+    def _obtain_allele_count(self, tree, site):
+        """Obtain a dictionary of allele counts, and the ancestral state is not
+        included in the dictionary. Input is the tree sequence site (`ts.site(ID)`)
+        instead of site ID, as obtaining `ts.site(ID)` can be time consuming. The
+        ancestral state is not deleted if the ancestral state is the only allele
+        at that site.
+        """
+        counts = collections.Counter({site.ancestral_state: self.ts.num_samples})
+        for m in site.mutations:
+            current_state = site.ancestral_state
+            if m.parent != tskit.NULL:
+                current_state = self.ts.mutation(m.parent).derived_state
+            # Silent mutations do nothing
+            if current_state != m.derived_state:
+                num_samples = tree.num_samples(m.node)
+                counts[m.derived_state] += num_samples
+                counts[current_state] -= num_samples
+        del counts[site.ancestral_state]
+        counts = {x: y for x, y in counts.items() if y != 0}
+        if len(counts) == 0:
+            counts = {site.ancestral_state: self.ts.num_samples}
+        return counts
+
+    def _frequency_dependence(self, allele_freq):
+        if allele_freq == 0 or allele_freq == 1:
+            const = 0
+        else:
+            const = np.sqrt(pow(2 * allele_freq * (1 - allele_freq), self.alpha))
+        return const
+
+    def sim_causal_mutation(self):
+        """This method randomly chooses causal sites and the corresponding causal state
+        based on the `num_causal` input. Afterwards, effect size of each causal site
+        is simulated based on the trait model given by the `model` input.
+
+        :return: Returns a pandas dataframe that includes causal site ID, causal allele
+            and simulated effect size
+        :rtype: pandas.DataFrame
+        """
+        causal_site_array = self._choose_causal_site()
+        num_samples = self.ts.num_samples
+        tree = tskit.Tree(self.ts)
+
+        causal_state_array = np.zeros(self.num_causal, dtype=object)
+        beta_array = np.zeros(self.num_causal)
+
+        for i, single_id in enumerate(causal_site_array):
+            site = self.ts.site(single_id)
+            tree.seek(site.position)
+            counts = self._obtain_allele_count(tree, site)
+            causal_state = self.rng.choice(list(counts))
+            causal_state_array[i] = causal_state
+            allele_freq = counts[causal_state] / num_samples
+            beta = self.model.sim_effect_size(num_causal=self.num_causal, rng=self.rng)
+            beta *= self._frequency_dependence(allele_freq)
+            beta_array[i] = beta
+
+        effect_size_df = pd.DataFrame(
+            {
+                "site_id": causal_site_array,
+                "causal_state": causal_state_array,
+                "effect_size": beta_array,
+            }
+        )
+
+        return effect_size_df
+
+
+def sim_trait(ts, num_causal, model, alpha=0, random_seed=None):
+    """Randomly selects causal sites from the inputted tree sequence data, and simulates
+    effect sizes of causal mutations.
+
+    :param ts: The tree sequence data that will be used in the quantitative trait
+        simulation. The tree sequence data must include a mutation.
+    :type ts: tskit.TreeSequence
+    :param num_causal: Number of causal sites that will be chosen randomly. It must be
+        a positive integer that is less than the number of sites in the tree sequence
+        data.
+    :type num_causal: int
+    :param model: Trait model that will be used to simulate effect sizes of causal
+        mutations.
+    :type model: TraitModel
+    :param alpha: Parameter that determines the relative weight on rarer variants.
+        A negative `alpha` value can increase the magnitude of effect sizes coming
+        from rarer variants. The frequency dependent architecture can be ignored
+        by setting `alpha` to be zero.
+    :type alpha: float
+    :param random_seed: The random seed. If this is not specified or None, simulation
+        will be done randomly.
+    :type random_seed: None or int
+    :return: Returns a pandas dataframe that includes causal site ID, causal allele and
+        simulated effect size. It can be used as an input in :func:`sim_phenotype`
+        function to simulate phenotypes.
+    :rtype: pandas.DataFrame
+    """
+    if not isinstance(ts, tskit.TreeSequence):
+        raise TypeError("Input must be a tree sequence data")
+    if not isinstance(num_causal, numbers.Number):
+        raise TypeError("Number of causal sites must be an integer")
+    if int(num_causal) != num_causal or num_causal <= 0:
+        raise ValueError("Number of causal sites must be a positive integer")
+    if not isinstance(model, tstrait.TraitModel):
+        raise TypeError("Trait model must be an instance of TraitModel")
+    num_sites = ts.num_sites
+    if num_sites == 0:
+        raise ValueError("No mutation in the provided data")
+    if num_causal > num_sites:
+        raise ValueError(
+            "There are less number of sites in the tree sequence than the inputted "
+            "number of causal sites"
+        )
+    if not isinstance(alpha, numbers.Number):
+        raise TypeError("Alpha must be a number")
+
+    simulator = TraitSimulator(
+        ts=ts,
+        num_causal=num_causal,
+        model=model,
+        alpha=alpha,
+        random_seed=random_seed,
+    )
+    effect_size_df = simulator.sim_causal_mutation()
+
+    return effect_size_df