INFRA: Verification

Statistical tests against external simulators
tskit-dev · Mar 5, 2024 · 6cdfef8 · 6cdfef8
1 parent 255539a
commit 6cdfef8
Show file tree

Hide file tree

Showing 9 changed files with 1,283 additions and 191 deletions.
diff --git a/data/comparison.py b/data/comparison.py
@@ -0,0 +1,259 @@
+"""
+This includes the Python codes to simulate quantitative traits based on three
+external simulators, AlphaSimR, simplePHENOTYPES, and the simulation framework
+described in the ARG-Needle paper.
+"""
+import subprocess
+import tempfile
+from dataclasses import dataclass
+
+import numpy as np
+import pandas as pd
+
+
+@dataclass
+class SimulationResult:
+    """
+    Dataclass that contains simulated effect sizes and phenotypes.
+
+    Attributes
+    ----------
+    trait : pandas.DataFrame
+        Trait dataframe.
+    phenotype : pandas.DataFrame
+        Phenotype dataframe.
+    """
+
+    phenotype: pd.DataFrame
+    trait: pd.DataFrame
+
+
+def simplePHENOTYPES_simulation(
+    ts, num_causal, add_effect, random_seed, num_trait=1, add_effect_2=1
+):
+    """
+    The function to simulate quantitative traits by using simplePHENOTYPES.
+    We will specify the number of causal sites and the parameter for the
+    geometric series where the effect sizes are determined.
+
+    Parameters
+    ----------
+    ts : tskit.TreeSequence
+        The tree sequence data that will be used in the quantitative trait
+        simulation.
+    num_causal : int
+        Number of causal sites.
+    add_effect : float
+        Parameter that is used in geometric series to obtain the effect
+        sizes of the first trait.
+    random_seed : int
+        Random seed.
+    num_trait : int
+        Number of traits to be simulated. It must be 0 or 1.
+    add_effect_2 : float
+        Parameter that is used in geometric series to obtain the effect
+        sizes of the second trait.
+    """
+
+    directory = tempfile.TemporaryDirectory()
+
+    vcf_filename = "vcf_comparison_simplePHENOTYPES"
+    with open(f"{directory.name}/{vcf_filename}.vcf", "w") as vcf_file:
+        ts.write_vcf(vcf_file)
+    cmd = ["Rscript", "data/simulate_simplePHENOTYPES.R"]
+    args = [
+        str(num_causal),
+        str(num_trait),
+        str(add_effect),
+        str(add_effect_2),
+        directory.name,
+        vcf_filename,
+        str(random_seed),
+    ]
+    input_cmd = cmd + args
+    subprocess.check_output(input_cmd)
+
+    if num_trait == 1:
+        phenotype_df = pd.read_csv(
+            f"{directory.name}/Simulated_Data_1_Reps_Herit_1.txt", sep="\t"
+        )
+        del phenotype_df["reps"]
+        phenotype_df = phenotype_df.rename(
+            columns={"<Trait>": "individual_id", "Pheno": "phenotype"}
+        )
+    else:
+        phenotype_df = pd.read_csv(
+            f"{directory.name}/Simulated_Data_1_Reps_Herit_1_1.txt", sep="\t"
+        )
+        del phenotype_df["Rep"]
+        phenotype_df = pd.melt(
+            phenotype_df,
+            value_vars=["Trait_1_H2_1", "Trait_2_H2_1"],
+            id_vars=["<Trait>"],
+        )
+        phenotype_df = phenotype_df.rename(
+            columns={
+                "<Trait>": "individual_id",
+                "value": "phenotype",
+                "variable": "trait_id",
+            }
+        )
+        phenotype_df = phenotype_df.replace({"Trait_1_H2_1": 0, "Trait_2_H2_1": 1})
+
+    num_ind = ts.num_individuals
+    # Change the individual ID in simplePHENOTYPES output to be consistent with the
+    # tstrait output
+    for i in range(num_ind):
+        phenotype_df = phenotype_df.replace(f"tsk_{i}", i)
+
+    qtn_df = pd.read_csv(f"{directory.name}/Additive_Selected_QTNs.txt", sep="\t")
+
+    # Obtain the list of causal allele
+    causal_allele = []
+    effect_size = []
+    effect_size_2 = []
+    for i, site_id in enumerate(qtn_df["snp"].values, start=1):
+        # simplePHENOTYPES uses ancestral state as a causal allele
+        allele = ts.site(site_id).ancestral_state
+        causal_allele.append(allele)
+        effect_size.append(add_effect**i)
+        effect_size_2.append(add_effect_2**i)
+
+    if num_trait == 2:
+        effect_size = np.append(effect_size, effect_size_2)
+
+    trait_df = pd.DataFrame(
+        {
+            "site_id": np.tile(qtn_df["snp"].values, num_trait),
+            "causal_allele": np.tile(causal_allele, num_trait),
+            "effect_size": effect_size,
+            "trait_id": np.repeat(np.arange(num_trait), len(causal_allele)),
+        }
+    )
+
+    simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df)
+
+    directory.cleanup()
+
+    return simulation_result
+
+
+def alphasimr_simulation(ts, num_causal, random_seed, corA=1, num_trait=1):
+    """
+    The function to simulate quantitative traits by using AlphaSimR. We will
+    specify the number of causal sites, such that the AlphaSimR simulation
+    will be conducted randomly.
+
+    Parameters
+    ----------
+    ts : tskit.TreeSequence
+        The tree sequence data that will be used in the quantitative trait
+        simulation.
+    num_causal : int
+        Number of causal sites.
+    corA : float
+        Correlation of pleiotropic traits.
+    num_trait : int
+        Number of traits to be simulated. It must be 1 or 2.
+    random_seed : int
+        Random seed.
+
+    Returns
+    -------
+    SimulationResult
+        Dataclass object that includes simulated phenotypes and effect sizes.
+    """
+
+    directory = tempfile.TemporaryDirectory()
+    tree_filename = "tree_comparison_AlphaSimR"
+    ts.dump(f"{directory.name}/{tree_filename}.tree")
+    phenotype_filename = "phenotype_comparison_AlphaSimR"
+    trait_filename = "trait_comparison_AlphaSimR"
+    cmd = ["Rscript", "data/simulate_AlphaSimR.R"]
+    args = [
+        str(num_causal),
+        directory.name,
+        tree_filename,
+        phenotype_filename,
+        trait_filename,
+        str(corA),
+        str(num_trait),
+        str(random_seed),
+    ]
+    input_cmd = cmd + args
+    subprocess.check_output(input_cmd)
+
+    phenotype_df = pd.read_csv(f"{directory.name}/{phenotype_filename}.csv")
+    trait_df = pd.read_csv(f"{directory.name}/{trait_filename}.csv")
+
+    # Obtain the list of causal allele
+    causal_allele = []
+    for site_id in trait_df["site_id"]:
+        allele = ts.mutation(site_id).derived_state
+        causal_allele.append(allele)
+
+    trait_df["causal_allele"] = causal_allele
+
+    simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df)
+
+    directory.cleanup()
+
+    return simulation_result
+
+
+def arg_needle_simulation(ts, alpha, random_seed):
+    """
+    The function to simulate quantitative traits based on the simulation framework
+    described in the ARG-Needle paper (Zhang et al., 2023). This assumes that all
+    sites are causal. The codes are adapted from https://zenodo.org/records/7745746.
+
+    Parameters
+    ----------
+    ts : tskit.TreeSequence
+        The tree sequence data that will be used in the quantitative trait
+        simulation.
+    alpha : float
+        The alpha parameter that will be used in frequency dependence
+        architecture.
+    random_seed : float
+        Random seed.
+
+    Returns
+    -------
+    SimulationResult
+        Dataclass object that includes simulated phenotypes and effect sizes.
+    """
+
+    rng = np.random.default_rng(random_seed)
+    num_ind = ts.num_individuals
+
+    phenotypes = np.zeros(num_ind)
+    beta_list = []
+    causal_allele = []
+
+    for variant in ts.variants():
+        row = variant.genotypes.astype("float64")
+        row = row.reshape((num_ind, 2)).sum(axis=-1)
+        std = np.std(row, ddof=1)
+        beta = rng.normal()
+        beta_list.append(beta)
+        causal_allele.append(variant.alleles[1])
+        phenotypes += row * (beta * std**alpha)
+
+    phenotypes -= np.mean(phenotypes)
+    phenotypes /= np.std(phenotypes, ddof=1)
+
+    phenotype_df = pd.DataFrame(
+        {"individual_id": np.arange(len(phenotypes)), "phenotype": phenotypes}
+    )
+    trait_df = pd.DataFrame(
+        {
+            "site_id": np.arange(len(causal_allele)),
+            "causal_allele": causal_allele,
+            "effect_size": beta_list,
+        }
+    )
+
+    simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df)
+
+    return simulation_result
diff --git a/data/simulate_AlphaSimR.R b/data/simulate_AlphaSimR.R
@@ -14,9 +14,9 @@ if (!require("tidyr")) install.packages("tidyr", repos='http://cran.us.r-project
 #   https://github.com/ ynorr/AlphaSimR_Examples/blob/master/misc/msprime.R
 # 2. Simulate quantitative traits of the founder population in AlphaSimR
 
-# The commandline input has 8 elements
+# The commandline input has 11 elements
 # [num_causal, temporary_directory_name, tree_filename, phenotype_filename,
-# trait_filename, corA, num_trait, random_seed]
+# trait_filename, corA, num_trait, h2, h2_2, num_rep, random_seed]
 myArgs <- commandArgs(trailingOnly = TRUE)
 # Convert to numerics
 num_causal <- as.numeric(myArgs[1])
@@ -26,7 +26,10 @@ phenotype_filename <- myArgs[4]
 trait_filename <- myArgs[5]
 corA <- as.numeric(myArgs[6])
 num_trait <- as.numeric(myArgs[7])
-random_seed <- as.numeric(myArgs[8])
+h2 <- as.numeric(myArgs[8])
+h2_2 <- as.numeric(myArgs[9])
+num_rep <- as.numeric(myArgs[10])
+random_seed <- as.numeric(myArgs[11])
 
 set.seed(random_seed)
 
@@ -51,49 +54,56 @@ if (num_trait == 1){
   mean <- 0
   var <- 1
   corA <- NULL
-  H2 <- 1
+  H2 <- h2
 } else if (num_trait == 2){
   mean <- c(0,0)
   var <- c(1,1)
   corA <- matrix(c(1,corA,corA,1),nrow=2,ncol=2)
-  H2 <- c(1,1)
+  H2 <- c(h2,h2_2)
 }
 
-SP <- SimParam$
-  new(founderPop)$
-  addTraitA(
-    nQtlPerChr=num_causal,
-    mean=mean,
-    var=var,
-    corA=corA
-  )$
-  setVarE(H2=H2)
-
-individuals <- newPop(founderPop)
-
-trait_df <- c()
-phenotype_df <- c()
-
-for (trait_id in 1:num_trait){
-  qtl_site <- SP$traits[[trait_id]]@lociLoc - 1
-  effect_size <- SP$traits[[trait_id]]@addEff
-  trait_id_df <- data.frame(
-    effect_size = effect_size,
-    site_id = qtl_site,
-    trait_id = rep(trait_id-1, length(effect_size))
-  )
-  trait_df <- rbind(trait_df, trait_id_df)
-  phenotype <- individuals@pheno[,trait_id]
-  phenotype_id_df <- data.frame(
-    phenotype=phenotype,
-    individual_id = 0:(num_ind-1),
-    trait_id = rep(trait_id-1, num_ind)
-  )
-  phenotype_df <- rbind(phenotype_df, phenotype_id_df)
+phenotype_result <- c()
+trait_result <- c()
+
+for (i in 1:num_rep) {
+  SP <- SimParam$
+    new(founderPop)$
+    addTraitA(
+      nQtlPerChr=num_causal,
+      mean=mean,
+      var=var,
+      corA=corA
+    )$
+    setVarE(H2=H2)
+
+  individuals <- newPop(founderPop)
+
+  trait_df <- c()
+  phenotype_df <- c()
+
+  for (trait_id in 1:num_trait){
+    qtl_site <- SP$traits[[trait_id]]@lociLoc - 1
+    effect_size <- SP$traits[[trait_id]]@addEff
+    trait_id_df <- data.frame(
+      effect_size = effect_size,
+      site_id = qtl_site,
+      trait_id = rep(trait_id-1, length(effect_size))
+    )
+    trait_df <- rbind(trait_df, trait_id_df)
+    phenotype <- individuals@pheno[,trait_id]
+    phenotype_id_df <- data.frame(
+      phenotype=phenotype,
+      individual_id = 0:(num_ind-1),
+      trait_id = rep(trait_id-1, num_ind)
+    )
+    phenotype_df <- rbind(phenotype_df, phenotype_id_df)
+  }
+  phenotype_result <- rbind(phenotype_result, phenotype_df)
+  trait_result <- rbind(trait_result, trait_df)
 }
 
 phenotype_filename <- paste0(directory_name,"/",phenotype_filename,".csv")
-write.csv(phenotype_df, phenotype_filename, row.names=FALSE)
+write.csv(phenotype_result, phenotype_filename, row.names=FALSE)
 
 trait_filename <- paste0(directory_name,"/",trait_filename,".csv")
-write.csv(trait_df, trait_filename, row.names=FALSE)
+write.csv(trait_result, trait_filename, row.names=FALSE)
diff --git a/data/simulate_simplePHENOTYPES.R b/data/simulate_simplePHENOTYPES.R
@@ -47,4 +47,4 @@ suppressMessages(create_phenotypes(
   verbose = FALSE,
   mean = mean,
   ntraits = num_trait
-))
+))