Skip to content

Commit

Permalink
INFRA: Verification
Browse files Browse the repository at this point in the history
Statistical tests against external simulators
  • Loading branch information
daikitag committed Mar 5, 2024
1 parent 255539a commit 6cdfef8
Show file tree
Hide file tree
Showing 9 changed files with 1,283 additions and 191 deletions.
259 changes: 259 additions & 0 deletions data/comparison.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
"""
This includes the Python codes to simulate quantitative traits based on three
external simulators, AlphaSimR, simplePHENOTYPES, and the simulation framework
described in the ARG-Needle paper.
"""
import subprocess
import tempfile
from dataclasses import dataclass

import numpy as np
import pandas as pd


@dataclass
class SimulationResult:
"""
Dataclass that contains simulated effect sizes and phenotypes.
Attributes
----------
trait : pandas.DataFrame
Trait dataframe.
phenotype : pandas.DataFrame
Phenotype dataframe.
"""

phenotype: pd.DataFrame
trait: pd.DataFrame


def simplePHENOTYPES_simulation(
ts, num_causal, add_effect, random_seed, num_trait=1, add_effect_2=1
):
"""
The function to simulate quantitative traits by using simplePHENOTYPES.
We will specify the number of causal sites and the parameter for the
geometric series where the effect sizes are determined.
Parameters
----------
ts : tskit.TreeSequence
The tree sequence data that will be used in the quantitative trait
simulation.
num_causal : int
Number of causal sites.
add_effect : float
Parameter that is used in geometric series to obtain the effect
sizes of the first trait.
random_seed : int
Random seed.
num_trait : int
Number of traits to be simulated. It must be 0 or 1.
add_effect_2 : float
Parameter that is used in geometric series to obtain the effect
sizes of the second trait.
"""

directory = tempfile.TemporaryDirectory()

vcf_filename = "vcf_comparison_simplePHENOTYPES"
with open(f"{directory.name}/{vcf_filename}.vcf", "w") as vcf_file:
ts.write_vcf(vcf_file)
cmd = ["Rscript", "data/simulate_simplePHENOTYPES.R"]
args = [
str(num_causal),
str(num_trait),
str(add_effect),
str(add_effect_2),
directory.name,
vcf_filename,
str(random_seed),
]
input_cmd = cmd + args
subprocess.check_output(input_cmd)

if num_trait == 1:
phenotype_df = pd.read_csv(
f"{directory.name}/Simulated_Data_1_Reps_Herit_1.txt", sep="\t"
)
del phenotype_df["reps"]
phenotype_df = phenotype_df.rename(
columns={"<Trait>": "individual_id", "Pheno": "phenotype"}
)
else:
phenotype_df = pd.read_csv(
f"{directory.name}/Simulated_Data_1_Reps_Herit_1_1.txt", sep="\t"
)
del phenotype_df["Rep"]
phenotype_df = pd.melt(
phenotype_df,
value_vars=["Trait_1_H2_1", "Trait_2_H2_1"],
id_vars=["<Trait>"],
)
phenotype_df = phenotype_df.rename(
columns={
"<Trait>": "individual_id",
"value": "phenotype",
"variable": "trait_id",
}
)
phenotype_df = phenotype_df.replace({"Trait_1_H2_1": 0, "Trait_2_H2_1": 1})

num_ind = ts.num_individuals
# Change the individual ID in simplePHENOTYPES output to be consistent with the
# tstrait output
for i in range(num_ind):
phenotype_df = phenotype_df.replace(f"tsk_{i}", i)

qtn_df = pd.read_csv(f"{directory.name}/Additive_Selected_QTNs.txt", sep="\t")

# Obtain the list of causal allele
causal_allele = []
effect_size = []
effect_size_2 = []
for i, site_id in enumerate(qtn_df["snp"].values, start=1):
# simplePHENOTYPES uses ancestral state as a causal allele
allele = ts.site(site_id).ancestral_state
causal_allele.append(allele)
effect_size.append(add_effect**i)
effect_size_2.append(add_effect_2**i)

if num_trait == 2:
effect_size = np.append(effect_size, effect_size_2)

trait_df = pd.DataFrame(
{
"site_id": np.tile(qtn_df["snp"].values, num_trait),
"causal_allele": np.tile(causal_allele, num_trait),
"effect_size": effect_size,
"trait_id": np.repeat(np.arange(num_trait), len(causal_allele)),
}
)

simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df)

directory.cleanup()

return simulation_result


def alphasimr_simulation(ts, num_causal, random_seed, corA=1, num_trait=1):
"""
The function to simulate quantitative traits by using AlphaSimR. We will
specify the number of causal sites, such that the AlphaSimR simulation
will be conducted randomly.
Parameters
----------
ts : tskit.TreeSequence
The tree sequence data that will be used in the quantitative trait
simulation.
num_causal : int
Number of causal sites.
corA : float
Correlation of pleiotropic traits.
num_trait : int
Number of traits to be simulated. It must be 1 or 2.
random_seed : int
Random seed.
Returns
-------
SimulationResult
Dataclass object that includes simulated phenotypes and effect sizes.
"""

directory = tempfile.TemporaryDirectory()
tree_filename = "tree_comparison_AlphaSimR"
ts.dump(f"{directory.name}/{tree_filename}.tree")
phenotype_filename = "phenotype_comparison_AlphaSimR"
trait_filename = "trait_comparison_AlphaSimR"
cmd = ["Rscript", "data/simulate_AlphaSimR.R"]
args = [
str(num_causal),
directory.name,
tree_filename,
phenotype_filename,
trait_filename,
str(corA),
str(num_trait),
str(random_seed),
]
input_cmd = cmd + args
subprocess.check_output(input_cmd)

phenotype_df = pd.read_csv(f"{directory.name}/{phenotype_filename}.csv")
trait_df = pd.read_csv(f"{directory.name}/{trait_filename}.csv")

# Obtain the list of causal allele
causal_allele = []
for site_id in trait_df["site_id"]:
allele = ts.mutation(site_id).derived_state
causal_allele.append(allele)

trait_df["causal_allele"] = causal_allele

simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df)

directory.cleanup()

return simulation_result


def arg_needle_simulation(ts, alpha, random_seed):
"""
The function to simulate quantitative traits based on the simulation framework
described in the ARG-Needle paper (Zhang et al., 2023). This assumes that all
sites are causal. The codes are adapted from https://zenodo.org/records/7745746.
Parameters
----------
ts : tskit.TreeSequence
The tree sequence data that will be used in the quantitative trait
simulation.
alpha : float
The alpha parameter that will be used in frequency dependence
architecture.
random_seed : float
Random seed.
Returns
-------
SimulationResult
Dataclass object that includes simulated phenotypes and effect sizes.
"""

rng = np.random.default_rng(random_seed)
num_ind = ts.num_individuals

phenotypes = np.zeros(num_ind)
beta_list = []
causal_allele = []

for variant in ts.variants():
row = variant.genotypes.astype("float64")
row = row.reshape((num_ind, 2)).sum(axis=-1)
std = np.std(row, ddof=1)
beta = rng.normal()
beta_list.append(beta)
causal_allele.append(variant.alleles[1])
phenotypes += row * (beta * std**alpha)

phenotypes -= np.mean(phenotypes)
phenotypes /= np.std(phenotypes, ddof=1)

phenotype_df = pd.DataFrame(
{"individual_id": np.arange(len(phenotypes)), "phenotype": phenotypes}
)
trait_df = pd.DataFrame(
{
"site_id": np.arange(len(causal_allele)),
"causal_allele": causal_allele,
"effect_size": beta_list,
}
)

simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df)

return simulation_result
86 changes: 48 additions & 38 deletions data/simulate_AlphaSimR.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ if (!require("tidyr")) install.packages("tidyr", repos='http://cran.us.r-project
# https://github.com/ ynorr/AlphaSimR_Examples/blob/master/misc/msprime.R
# 2. Simulate quantitative traits of the founder population in AlphaSimR

# The commandline input has 8 elements
# The commandline input has 11 elements
# [num_causal, temporary_directory_name, tree_filename, phenotype_filename,
# trait_filename, corA, num_trait, random_seed]
# trait_filename, corA, num_trait, h2, h2_2, num_rep, random_seed]
myArgs <- commandArgs(trailingOnly = TRUE)
# Convert to numerics
num_causal <- as.numeric(myArgs[1])
Expand All @@ -26,7 +26,10 @@ phenotype_filename <- myArgs[4]
trait_filename <- myArgs[5]
corA <- as.numeric(myArgs[6])
num_trait <- as.numeric(myArgs[7])
random_seed <- as.numeric(myArgs[8])
h2 <- as.numeric(myArgs[8])
h2_2 <- as.numeric(myArgs[9])
num_rep <- as.numeric(myArgs[10])
random_seed <- as.numeric(myArgs[11])

set.seed(random_seed)

Expand All @@ -51,49 +54,56 @@ if (num_trait == 1){
mean <- 0
var <- 1
corA <- NULL
H2 <- 1
H2 <- h2
} else if (num_trait == 2){
mean <- c(0,0)
var <- c(1,1)
corA <- matrix(c(1,corA,corA,1),nrow=2,ncol=2)
H2 <- c(1,1)
H2 <- c(h2,h2_2)
}

SP <- SimParam$
new(founderPop)$
addTraitA(
nQtlPerChr=num_causal,
mean=mean,
var=var,
corA=corA
)$
setVarE(H2=H2)

individuals <- newPop(founderPop)

trait_df <- c()
phenotype_df <- c()

for (trait_id in 1:num_trait){
qtl_site <- SP$traits[[trait_id]]@lociLoc - 1
effect_size <- SP$traits[[trait_id]]@addEff
trait_id_df <- data.frame(
effect_size = effect_size,
site_id = qtl_site,
trait_id = rep(trait_id-1, length(effect_size))
)
trait_df <- rbind(trait_df, trait_id_df)
phenotype <- individuals@pheno[,trait_id]
phenotype_id_df <- data.frame(
phenotype=phenotype,
individual_id = 0:(num_ind-1),
trait_id = rep(trait_id-1, num_ind)
)
phenotype_df <- rbind(phenotype_df, phenotype_id_df)
phenotype_result <- c()
trait_result <- c()

for (i in 1:num_rep) {
SP <- SimParam$
new(founderPop)$
addTraitA(
nQtlPerChr=num_causal,
mean=mean,
var=var,
corA=corA
)$
setVarE(H2=H2)

individuals <- newPop(founderPop)

trait_df <- c()
phenotype_df <- c()

for (trait_id in 1:num_trait){
qtl_site <- SP$traits[[trait_id]]@lociLoc - 1
effect_size <- SP$traits[[trait_id]]@addEff
trait_id_df <- data.frame(
effect_size = effect_size,
site_id = qtl_site,
trait_id = rep(trait_id-1, length(effect_size))
)
trait_df <- rbind(trait_df, trait_id_df)
phenotype <- individuals@pheno[,trait_id]
phenotype_id_df <- data.frame(
phenotype=phenotype,
individual_id = 0:(num_ind-1),
trait_id = rep(trait_id-1, num_ind)
)
phenotype_df <- rbind(phenotype_df, phenotype_id_df)
}
phenotype_result <- rbind(phenotype_result, phenotype_df)
trait_result <- rbind(trait_result, trait_df)
}

phenotype_filename <- paste0(directory_name,"/",phenotype_filename,".csv")
write.csv(phenotype_df, phenotype_filename, row.names=FALSE)
write.csv(phenotype_result, phenotype_filename, row.names=FALSE)

trait_filename <- paste0(directory_name,"/",trait_filename,".csv")
write.csv(trait_df, trait_filename, row.names=FALSE)
write.csv(trait_result, trait_filename, row.names=FALSE)
2 changes: 1 addition & 1 deletion data/simulate_simplePHENOTYPES.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,4 @@ suppressMessages(create_phenotypes(
verbose = FALSE,
mean = mean,
ntraits = num_trait
))
))
Loading

0 comments on commit 6cdfef8

Please sign in to comment.