diff --git a/data/simulate_AlphaSimR.R b/data/simulate_AlphaSimR.R deleted file mode 100644 index a768b6c..0000000 --- a/data/simulate_AlphaSimR.R +++ /dev/null @@ -1,99 +0,0 @@ -if (!require("AlphaSimR")) install.packages("AlphaSimR", repos='http://cran.us.r-project.org') -library(AlphaSimR) -if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org') -library(reticulate) -if (!require("tidyr")) install.packages("tidyr", repos='http://cran.us.r-project.org') - -# This code is used from verification.py to simulate quantitative traits -# by using AlphaSimR. -# -# The basic simulation step is the following: -# 1. Use the tskit Python package through the R package tskit and load the tree -# sequence data as a founder population in AlphaSimR. The codes of this step are -# largely adapted from -# https://github.com/ ynorr/AlphaSimR_Examples/blob/master/misc/msprime.R -# 2. Simulate quantitative traits of the founder population in AlphaSimR - -# The commandline input has 8 elements -# [num_causal, temporary_directory_name, tree_filename, phenotype_filename, -# trait_filename, corA, num_trait, random_seed] -myArgs <- commandArgs(trailingOnly = TRUE) -# Convert to numerics -num_causal <- as.numeric(myArgs[1]) -directory_name <- myArgs[2] -tree_filename <- myArgs[3] -phenotype_filename <- myArgs[4] -trait_filename <- myArgs[5] -corA <- as.numeric(myArgs[6]) -num_trait <- as.numeric(myArgs[7]) -random_seed <- as.numeric(myArgs[8]) - -set.seed(random_seed) - -tskit <- import("tskit") - -tree_filename <- paste0(directory_name,"/", tree_filename,".tree") -ts <- tskit$load(tree_filename) - -sites <- ts$tables$sites$asdict() -pos <- sites$position * 1e-8 # Convert to Morgans -pos <- pos - pos[1] # Set first position to zero - -# Extract haplotypes -haplo <- t(ts$genotype_matrix()) - -# Create an AlphaSimR founder population -founderPop <- newMapPop(genMap=list(pos), haplotypes=list(haplo)) - -num_ind <- nrow(haplo) / 2 - -if (num_trait == 1){ - mean <- 0 - var <- 1 - corA <- NULL - H2 <- 1 -} else if (num_trait == 2){ - mean <- c(0,0) - var <- c(1,1) - corA <- matrix(c(1,corA,corA,1),nrow=2,ncol=2) - H2 <- c(1,1) -} - -SP <- SimParam$ - new(founderPop)$ - addTraitA( - nQtlPerChr=num_causal, - mean=mean, - var=var, - corA=corA - )$ - setVarE(H2=H2) - -individuals <- newPop(founderPop) - -trait_df <- c() -phenotype_df <- c() - -for (trait_id in 1:num_trait){ - qtl_site <- SP$traits[[trait_id]]@lociLoc - 1 - effect_size <- SP$traits[[trait_id]]@addEff - trait_id_df <- data.frame( - effect_size = effect_size, - site_id = qtl_site, - trait_id = rep(trait_id-1, length(effect_size)) - ) - trait_df <- rbind(trait_df, trait_id_df) - phenotype <- individuals@pheno[,trait_id] - phenotype_id_df <- data.frame( - phenotype=phenotype, - individual_id = 0:(num_ind-1), - trait_id = rep(trait_id-1, num_ind) - ) - phenotype_df <- rbind(phenotype_df, phenotype_id_df) -} - -phenotype_filename <- paste0(directory_name,"/",phenotype_filename,".csv") -write.csv(phenotype_df, phenotype_filename, row.names=FALSE) - -trait_filename <- paste0(directory_name,"/",trait_filename,".csv") -write.csv(trait_df, trait_filename, row.names=FALSE) \ No newline at end of file diff --git a/scripts/simulate_AlphaSimR.R b/scripts/simulate_AlphaSimR.R new file mode 100644 index 0000000..6be3d84 --- /dev/null +++ b/scripts/simulate_AlphaSimR.R @@ -0,0 +1,106 @@ +if (!require("AlphaSimR")) install.packages("AlphaSimR", repos='http://cran.us.r-project.org') +library(AlphaSimR) +if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org') +library(reticulate) + +# This code is used from verification.py to simulate quantitative traits +# by using AlphaSimR. +# +# The basic simulation step is the following: +# 1. Use the tskit Python package through the R package tskit and load the tree +# sequence data as a founder population in AlphaSimR. The codes of this step are +# largely adapted from +# https://github.com/ ynorr/AlphaSimR_Examples/blob/master/misc/msprime.R +# 2. Simulate quantitative traits of the founder population in AlphaSimR + +# The commandline input has 8 elements +# [num_causal, temporary_directory_name, +# corA, num_trait, h2, h2_2, num_rep, random_seed] + +myArgs <- commandArgs(trailingOnly = TRUE) +# Convert to numerics +num_causal <- as.numeric(myArgs[1]) +directory_name <- myArgs[2] +corA <- as.numeric(myArgs[3]) +num_trait <- as.numeric(myArgs[4]) +h2 <- as.numeric(myArgs[5]) +h2_2 <- as.numeric(myArgs[6]) +num_rep <- as.numeric(myArgs[7]) +random_seed <- as.numeric(myArgs[8]) + +set.seed(random_seed) + +tskit <- import("tskit") + +tree_filename <- paste0(directory_name,"/tree_seq.tree") +ts <- tskit$load(tree_filename) + +sites <- ts$tables$sites$asdict() +pos <- sites$position * 1e-8 # Convert to Morgans +pos <- pos - pos[1] # Set first position to zero + +# Extract haplotypes +haplo <- t(ts$genotype_matrix()) + +# Create an AlphaSimR founder population +founderPop <- newMapPop(genMap=list(pos), haplotypes=list(haplo)) + +num_ind <- nrow(haplo) / 2 + +if (num_trait == 1){ + mean <- 0 + var <- 1 + corA <- NULL + H2 <- h2 +} else if (num_trait == 2){ + mean <- c(0,0) + var <- c(1,1) + corA <- matrix(c(1,corA,corA,1),nrow=2,ncol=2) + H2 <- c(h2,h2_2) +} + +phenotype_result <- c() +trait_result <- c() + +for (i in 1:num_rep) { + SP <- SimParam$ + new(founderPop)$ + addTraitA( + nQtlPerChr=num_causal, + mean=mean, + var=var, + corA=corA + )$ + setVarE(H2=H2) + + individuals <- newPop(founderPop) + + trait_df <- c() + phenotype_df <- c() + + for (trait_id in 1:num_trait){ + qtl_site <- SP$traits[[trait_id]]@lociLoc - 1 + effect_size <- SP$traits[[trait_id]]@addEff + trait_id_df <- data.frame( + effect_size = effect_size, + site_id = qtl_site, + trait_id = rep(trait_id-1, length(effect_size)) + ) + trait_df <- rbind(trait_df, trait_id_df) + phenotype <- individuals@pheno[,trait_id] + phenotype_id_df <- data.frame( + phenotype=phenotype, + individual_id = 0:(num_ind-1), + trait_id = rep(trait_id-1, num_ind) + ) + phenotype_df <- rbind(phenotype_df, phenotype_id_df) + } + phenotype_result <- rbind(phenotype_result, phenotype_df) + trait_result <- rbind(trait_result, trait_df) +} + +phenotype_filename <- paste0(directory_name,"/phenotype_alphasimr.csv") +write.csv(phenotype_result, phenotype_filename, row.names=FALSE) + +trait_filename <- paste0(directory_name,"/trait_alphasimr.csv") +write.csv(trait_result, trait_filename, row.names=FALSE) diff --git a/data/simulate_simplePHENOTYPES.R b/scripts/simulate_simplePHENOTYPES_exact.R similarity index 77% rename from data/simulate_simplePHENOTYPES.R rename to scripts/simulate_simplePHENOTYPES_exact.R index fd94c78..de71d44 100644 --- a/data/simulate_simplePHENOTYPES.R +++ b/scripts/simulate_simplePHENOTYPES_exact.R @@ -1,16 +1,14 @@ if (!require("simplePHENOTYPES")) install.packages("simplePHENOTYPES", repos='http://cran.us.r-project.org') library(simplePHENOTYPES) -if (!require("reticulate")) install.packages("reticulate", repos='http://cran.us.r-project.org') -library(reticulate) # This code is used from verification.py to simulate quantitative traits # by using simplePHENOTYPES. # This code loads the vcf file and simulates quantitative traits -# The commandline input has 7 elements +# The commandline input has 6 elements # [num_causal, num_trait, add_effect, add_effect_2, directory_name, -# vcf_filename, random_seed] +# random_seed] myArgs <- commandArgs(trailingOnly = TRUE) num_causal <- as.numeric(myArgs[1]) @@ -18,8 +16,7 @@ num_trait <- as.numeric(myArgs[2]) add_effect <- as.numeric(myArgs[3]) add_effect_2 <- as.numeric(myArgs[4]) directory_name <- myArgs[5] -vcf_filename <- myArgs[6] -random_seed <- as.numeric(myArgs[7]) +random_seed <- as.numeric(myArgs[6]) if (num_trait == 1){ effect <- add_effect @@ -32,7 +29,7 @@ if (num_trait == 1){ } suppressMessages(create_phenotypes( - geno_file = paste0(directory_name, "/", vcf_filename, ".vcf"), + geno_file = paste0(directory_name, "/tree_seq.vcf"), add_QTN_num = num_causal, add_effect = effect, rep = 1, diff --git a/scripts/simulate_simplePHENOTYPES_qqplot.R b/scripts/simulate_simplePHENOTYPES_qqplot.R new file mode 100644 index 0000000..39cd01d --- /dev/null +++ b/scripts/simulate_simplePHENOTYPES_qqplot.R @@ -0,0 +1,70 @@ +if (!require("simplePHENOTYPES")) install.packages("simplePHENOTYPES", repos='http://cran.us.r-project.org') +library(simplePHENOTYPES) + +# This code is used from verification.py to simulate quantitative traits +# by using simplePHENOTYPES. + +# This code loads the vcf file that is generated in `verification.py` +# and uses the effect size from a normal distribution to simulate +# additive traits. + +# The commandline input has 7 elements +# [num_causal, h2, directory_name, +# num_rep, mean, var, random_seed] +myArgs <- commandArgs(trailingOnly = TRUE) + +num_causal <- as.numeric(myArgs[1]) +h2 <- as.numeric(myArgs[2]) +directory_name <- myArgs[3] +num_rep <- myArgs[4] +mean <- as.numeric(myArgs[5]) +var <- as.numeric(myArgs[6]) +random_seed <- as.numeric(myArgs[7]) + +set.seed(random_seed) + +sd <- sqrt(var) + +# Function to simulate phenotypes from simplePHENOTYPES +# The effect sizes are simulated from a normal distribution, +# as the geometric series is the only effect size distribution +# supported in simplePHENOTYPES. +simulate_simplePHENOTYPE <- function( + num_causal, random_seed + ) { + effect_size <- list(rnorm(n=num_causal, mean=mean, sd=sd)) + phenotypes <- suppressMessages(create_phenotypes( + geno_file = paste0(directory_name, "/tree_seq.vcf"), + add_QTN_num = num_causal, + add_effect = effect_size, + rep = 1, + h2 = h2, + model = "A", + seed = random_seed, + vary_QTN = FALSE, + to_r = TRUE, + sim_method = "custom", + quiet = TRUE, + home_dir = directory_name, + verbose = FALSE, + mean = 0 + )) + # The mean is centered at 0 from simplePHENOTYPES simulation + # so we will divide it by the standard deviation to normalise + # the data + phenotypes[,2] <- phenotypes[,2] / sd(phenotypes[,2]) + names(phenotypes)[1:2] <- c("individual_id", "phenotype") + return(phenotypes) +} + +phenotype_result <- c() + +for (i in 1:num_rep) { + simulated_result <- simulate_simplePHENOTYPE( + num_causal=num_causal, random_seed=random_seed+i + ) + phenotype_result <- rbind(phenotype_result, simulated_result) +} + +filename = paste0(directory_name, "/simplePHENOTYPES.csv") +write.csv(phenotype_result, filename, row.names=FALSE) diff --git a/scripts/simulation_codes.md b/scripts/simulation_codes.md new file mode 100644 index 0000000..2e2920b --- /dev/null +++ b/scripts/simulation_codes.md @@ -0,0 +1,10 @@ +# Simulation Codes + +This +This directory is used to store the R simulation codes. + +R simulation codes: + +- simulate_AlphaSimR.R: This R code uses AlphaSimR to simulate quantitative traits based on a simulated tree sequence data. +- simulate_simplePHENOTYPES_exact.R: This R code uses simplePHENOTYPES to simulate quantitative traits based on a VCF file. This is used for exact tests. +- simulate_simplePHENOTYPES_qqplot.R: This R code uses simplePHENOTYPES to simulate quantitative traits. This is used in qqplot construction. \ No newline at end of file diff --git a/verification.py b/verification.py index ba7fb07..7b43454 100644 --- a/verification.py +++ b/verification.py @@ -2,6 +2,57 @@ Script to automate verification of tstrait against known statistical results and benchmark programs such as AlphaSimR and simplePHENOTYPES. +We have conducted the following tests: + +1. Exact tests +We simulated effect sizes and phenotypes by using AlphaSimR, simplePHENOTYPES +and the simulation framework described in ARG-Needle paper, and used the +simulated effect sizes in tstrait to simulate phenotypes, while setting the +environmental noise to be zero in all simulations. We then tested if the +simulated phenotypes in tstrait exactly match the simulated phenotypes +of external programs. + +This test aims to examine whether tstrait can correctly use the genetic information +of individuals to accurately compute the genetic values. We have validated the +tstrait's output for single trait simulation and pleiotropic trait simulation. +These tests are implemented in `ExactTest` class. + +2. Comparison tests +We simulated phenotypes in AlphaSimR, simplePHENOTYPES and the simulation +framework described in ARG-Needle paper by using the same parameters as the +tstrait simulation. We have simulated traits for a single individual in the +tree sequence multiple times and examined if their phenotype distributions match +by using a QQ-plot. + +This test serves as an end to end testing of tstrait with environmental noise +simulation and tries to examine if the statistical properties of the +simulated traits matches the output of different simulation packages. +We have examined the tstrait output for different values of +heritability and the alpha parameter that is used in the frequency dependence +architecture. These tests are implemented in ComparisonTest. + +3. Statistical tests +We have examined the statistical properties of tstrait's simulation output. +The tests in `EffectSizeDistribution` examine the statistical properties of +simulated effect sizes and the tests in `EnvironmentalNoise` examine the +simulated environmental noise. + +NOTE: The properties of tstrait's simulation algorithm (such as whether it +can correctly detect mutations in a tree sequence) are validated in unit tests. + +THe differences between each simulators are highlighted as the following: +1. simplePHENOTYPES +- Effect sizes can only be simulated from geometric series, so a normal +distribution must be specified if we want to simulate traits where effect +sizes are drawn from a normal distribution +- Ancestral state is set as a causal state in simplePHENOTYPES + +2. AlphaSimR +- Genetic values are normalized in the simulation process + +3. Simulation framework in ARG-Needle paper +- We assume that all sites are causal + These codes are largely adapted from msprime/verification.py. Please see its documentation for usage. @@ -25,6 +76,7 @@ import pandas as pd import scipy import seaborn as sns +import stdpopsim import tqdm import tstrait from matplotlib import pyplot @@ -101,49 +153,70 @@ class SimulationResult: trait: pd.DataFrame -class ExactTest(Test): +def _simulate_simplePHENOTYPES_qqplot( + ts, num_causal, h2, random_seed, num_rep, mean=0, var=1 +): """ - Compare tstrait against simplePHENOTYPES, AlphaSimR and the simulation framework - in ARG-Needle paper. For all benchmarking, we simulate a genetic data of 100 - individuals with 100 kb sequence length and mutations are simulated from an - infinite-sites model. - - We assume that the following in all quantitative trait simulation: - - Narrow-sense heritability is 1 to compare the simulated genetic values. - - Genetic effect sizes are simulated from an external simulator and they are used to - simulate quantitative traits in tstrait. + The input of this function is a tree sequence and the output is a phenotype + dataframe. This function is used for comparing the simulation output in + a QQ-plot. + The output of this function will be a phenotype dataframe. """ + with tempfile.TemporaryDirectory() as tmpdirname: + with open(f"{tmpdirname}/tree_seq.vcf", "w") as vcf_file: + ts.write_vcf( + vcf_file, individual_names=np.arange(ts.num_individuals).astype(str) + ) + cmd = ["Rscript", "scripts/simulate_simplePHENOTYPES_qqplot.R"] + args = [ + str(num_causal), + str(h2), + tmpdirname, + str(num_rep), + str(mean), + str(var), + str(random_seed), + ] + input_cmd = cmd + args + subprocess.check_output(input_cmd) - def _simulate_simplePHENOTYPE( - self, ts, num_causal, add_effect, random_seed, num_trait=1, add_effect_2=1 - ): - """ - The function to simulate quantitative traits by using simplePHENOTYPES. - We will specify the number of causal sites and the parameter for the - geometric series where the effect sizes are determined. - """ + phenotype_df = pd.read_csv(f"{tmpdirname}/simplePHENOTYPES.csv") + + return phenotype_df - directory = tempfile.TemporaryDirectory() - vcf_filename = "vcf_comparison_simplePHENOTYPES" - with open(f"{directory.name}/{vcf_filename}.vcf", "w") as vcf_file: - ts.write_vcf(vcf_file) - cmd = ["Rscript", "data/simulate_simplePHENOTYPES.R"] +def _simulate_simplePHENOTYPES_exact( + ts, num_causal, random_seed, add_effect=1, num_trait=1, add_effect_2=1 +): + """ + The function to simulate quantitative traits by using simplePHENOTYPES. + We will specify the number of causal sites and the parameter for the + geometric series where the effect sizes are determined. + The output of this function will be a SimulationResult object. + """ + + with tempfile.TemporaryDirectory() as tmpdirname: + with open(f"{tmpdirname}/tree_seq.vcf", "w") as vcf_file: + ts.write_vcf( + vcf_file, individual_names=np.arange(ts.num_individuals).astype(str) + ) + cmd = ["Rscript", "scripts/simulate_simplePHENOTYPES_exact.R"] args = [ str(num_causal), str(num_trait), str(add_effect), str(add_effect_2), - directory.name, - vcf_filename, + tmpdirname, str(random_seed), ] input_cmd = cmd + args subprocess.check_output(input_cmd) + qtn_df = pd.read_csv(f"{tmpdirname}/Additive_Selected_QTNs.txt", sep="\t") + if num_trait == 1: phenotype_df = pd.read_csv( - f"{directory.name}/Simulated_Data_1_Reps_Herit_1.txt", sep="\t" + f"{tmpdirname}/Simulated_Data_1_Reps_Herit_1.txt", sep="\t" ) del phenotype_df["reps"] phenotype_df = phenotype_df.rename( @@ -151,7 +224,7 @@ def _simulate_simplePHENOTYPE( ) else: phenotype_df = pd.read_csv( - f"{directory.name}/Simulated_Data_1_Reps_Herit_1_1.txt", sep="\t" + f"{tmpdirname}/Simulated_Data_1_Reps_Herit_1_1.txt", sep="\t" ) del phenotype_df["Rep"] phenotype_df = pd.melt( @@ -168,97 +241,93 @@ def _simulate_simplePHENOTYPE( ) phenotype_df = phenotype_df.replace({"Trait_1_H2_1": 0, "Trait_2_H2_1": 1}) - num_ind = ts.num_individuals - # Change the individual ID in simplePHENOTYPES output to be consistent with the - # tstrait output - for i in range(num_ind): - phenotype_df = phenotype_df.replace(f"tsk_{i}", i) - - qtn_df = pd.read_csv(f"{directory.name}/Additive_Selected_QTNs.txt", sep="\t") - - # Obtain the list of causal allele - causal_allele = [] - effect_size = [] - effect_size_2 = [] - for i, site_id in enumerate(qtn_df["snp"].values, start=1): - # simplePHENOTYPES uses ancestral state as a causal allele - allele = ts.site(site_id).ancestral_state - causal_allele.append(allele) - effect_size.append(add_effect**i) - effect_size_2.append(add_effect_2**i) - - if num_trait == 2: - effect_size = np.append(effect_size, effect_size_2) - - trait_df = pd.DataFrame( - { - "site_id": np.tile(qtn_df["snp"].values, num_trait), - "causal_allele": np.tile(causal_allele, num_trait), - "effect_size": effect_size, - "trait_id": np.repeat(np.arange(num_trait), len(causal_allele)), - } - ) + # Obtain the list of causal allele + causal_allele = [] + effect_size = [] + effect_size_2 = [] + for i, site_id in enumerate(qtn_df["snp"].values, start=1): + # simplePHENOTYPES uses ancestral state as a causal allele + allele = ts.site(site_id).ancestral_state + causal_allele.append(allele) + effect_size.append(add_effect**i) + effect_size_2.append(add_effect_2**i) + + if num_trait == 2: + effect_size = np.append(effect_size, effect_size_2) + + trait_df = pd.DataFrame( + { + "site_id": np.tile(qtn_df["snp"].values, num_trait), + "causal_allele": np.tile(causal_allele, num_trait), + "effect_size": effect_size, + "trait_id": np.repeat(np.arange(num_trait), len(causal_allele)), + } + ) - simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df) + simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df) - directory.cleanup() + return simulation_result - return simulation_result - def _simulate_AlphaSimR(self, ts, num_causal, random_seed, corA=1, num_trait=1): - """ - The function to simulate quantitative traits by using AlphaSimR. We will - specify the number of causal sites, such that the AlphaSimR simulation - will be conducted randomly. corA is used to specify the correlation - coefficient of pleiotropic traits. - """ +def _simulate_AlphaSimR( + ts, num_causal, h2, random_seed, num_rep=1, h2_2=1, corA=1, num_trait=1 +): + """ + The function to simulate quantitative traits by using AlphaSimR. We will + specify the number of causal sites, such that the AlphaSimR simulation + will be conducted randomly. corA is used to specify the correlation + coefficient of pleiotropic traits. The output of this function is a + PhenotypeResult object. + """ - directory = tempfile.TemporaryDirectory() - tree_filename = "tree_comparison_AlphaSimR" - ts.dump(f"{directory.name}/{tree_filename}.tree") - phenotype_filename = "phenotype_comparison_AlphaSimR" - trait_filename = "trait_comparison_AlphaSimR" - cmd = ["Rscript", "data/simulate_AlphaSimR.R"] + with tempfile.TemporaryDirectory() as tmpdirname: + ts.dump(f"{tmpdirname}/tree_seq.tree") + cmd = ["Rscript", "scripts/simulate_AlphaSimR.R"] args = [ str(num_causal), - directory.name, - tree_filename, - phenotype_filename, - trait_filename, + tmpdirname, str(corA), str(num_trait), + str(h2), + str(h2_2), + str(num_rep), str(random_seed), ] input_cmd = cmd + args subprocess.check_output(input_cmd) - phenotype_df = pd.read_csv(f"{directory.name}/{phenotype_filename}.csv") - trait_df = pd.read_csv(f"{directory.name}/{trait_filename}.csv") + phenotype_df = pd.read_csv(f"{tmpdirname}/phenotype_alphasimr.csv") + trait_df = pd.read_csv(f"{tmpdirname}/trait_alphasimr.csv") - # Obtain the list of causal allele - causal_allele = [] - for site_id in trait_df["site_id"]: - allele = ts.mutation(site_id).derived_state - causal_allele.append(allele) + # Obtain the list of causal allele + causal_allele = [] + for site_id in trait_df["site_id"]: + allele = ts.mutation(site_id).derived_state + causal_allele.append(allele) - trait_df["causal_allele"] = causal_allele + trait_df["causal_allele"] = causal_allele - simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df) + simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df) - directory.cleanup() + return simulation_result - return simulation_result - def _simulate_arg_needle(self, ts, alpha, random_seed): - """ - Simulate phenotypes by using the quantitative trait simulation framework that is - adapted from the ARG-Needle paper. - https://zenodo.org/records/7745746 - """ +def _simulate_arg_needle(ts, alpha, h2, random_seed, num_rep=1): + """ + Simulate phenotypes by using the quantitative trait simulation framework that is + adapted from the ARG-Needle paper. + https://zenodo.org/records/7745746 + """ - rng = np.random.default_rng(random_seed) - num_ind = ts.num_individuals + rng = np.random.default_rng(random_seed) + num_ind = ts.num_individuals + + phenotype_result = pd.DataFrame({"individual_id": [], "phenotype": []}) + trait_result = pd.DataFrame( + {"site_id": [], "causal_allele": [], "effect_size": [], "trait_id": []} + ) + for _ in range(num_rep): phenotypes = np.zeros(num_ind) beta_list = [] causal_allele = [] @@ -266,12 +335,24 @@ def _simulate_arg_needle(self, ts, alpha, random_seed): for variant in ts.variants(): row = variant.genotypes.astype("float64") row = row.reshape((num_ind, 2)).sum(axis=-1) + # This uses std instead of \sqrt{2p(1-p)} that we are using, but since we + # are using an infinite-sites model, the standard deviation will + # approximately be \sqrt{2p(1-p)} when the sample size is large. std = np.std(row, ddof=1) beta = rng.normal() - beta_list.append(beta) causal_allele.append(variant.alleles[1]) phenotypes += row * (beta * std**alpha) + beta_list.append(beta * std**alpha) + + # normalize pheno to have mean 0 and var = h2 + phenotypes -= np.mean(phenotypes) + phenotypes /= np.std(phenotypes, ddof=1) + phenotypes *= np.sqrt(h2) + + # sample environmental component with variance 1-h2 and add it to phenotype + phenotypes += rng.normal(size=num_ind) * np.sqrt(1 - h2) + # normalize it all phenotypes -= np.mean(phenotypes) phenotypes /= np.std(phenotypes, ddof=1) @@ -287,11 +368,33 @@ def _simulate_arg_needle(self, ts, alpha, random_seed): } ) - simulation_result = SimulationResult(phenotype=phenotype_df, trait=trait_df) + phenotype_result = pd.concat( + [phenotype_result, phenotype_df], ignore_index=True + ) + trait_result = pd.concat([trait_result, trait_df], ignore_index=True) + + trait_result = trait_result.astype({"site_id": int}) + simulation_result = SimulationResult(phenotype=phenotype_result, trait=trait_result) + + return simulation_result + - return simulation_result +class ExactTest(Test): + """ + Test class that conducts exact test. Phenotype and trait information are + simulated by using an external simulator. Afterwards, the trait information is used + to compute the genetic values by using tstrait, and their values are compared + to examine if the tstrait package can compute the accurate genetic values. + + The numericalization input is used to modify the numericalization of genotypes. + The standard numericalization of tstrait is (AA=2, Aa=1, aa=0), where A is the + causal allele, but when numericalization is True, it will be (AA=1, Aa=0, aa=-1). + """ - def _run(self, model, random_seed, **kwargs): + def _simulate_tree_seq(self, random_seed): + """ + Simulates a tree sequence from an infinite-sites model. + """ ts = msprime.sim_ancestry( samples=100, recombination_rate=1e-8, @@ -303,81 +406,553 @@ def _run(self, model, random_seed, **kwargs): ts, rate=1e-8, random_seed=random_seed, discrete_genome=False ) - if model == "simplePHENOTYPES": - simulation_output = self._simulate_simplePHENOTYPE( - ts=ts, - num_causal=kwargs["num_causal"], - add_effect=kwargs["add_effect"], - random_seed=random_seed, - num_trait=kwargs["num_trait"], - add_effect_2=kwargs["add_effect_2"], - ) - - elif model == "AlphaSimR": - simulation_output = self._simulate_AlphaSimR( - ts=ts, - num_causal=kwargs["num_causal"], - random_seed=random_seed, - corA=kwargs["corA"], - num_trait=kwargs["num_trait"], - ) - - elif model == "ARG-Needle": - simulation_output = self._simulate_arg_needle( - ts=ts, alpha=kwargs["alpha"], random_seed=random_seed - ) + return ts - trait_df = simulation_output.trait.sort_values(by=["site_id"]) + def _compute_tstrait(self, ts, trait_df, numericalization=False): + """ + This method computes genetic values from the trait dataframe. + There is an option to change the numericalization of genetic values. + No other post-processing is conducted to the computed genetic values. + """ + trait_df = trait_df.sort_values(by=["site_id"]) genetic_df = tstrait.genetic_value(ts, trait_df) + if numericalization: + trait_id_array = trait_df["trait_id"].unique() + for trait_id in trait_id_array: + effect_size_sum = trait_df[trait_df["trait_id"] == trait_id][ + "effect_size" + ].sum() + genetic_df.loc[ + genetic_df["trait_id"] == trait_id, ["genetic_value"] + ] += effect_size_sum phenotype_df = tstrait.sim_env(genetic_df, h2=1, random_seed=1) - if model == "simplePHENOTYPES": - grouped = phenotype_df.groupby("trait_id")[["phenotype"]] - phenotype_df = grouped.transform(lambda x: (x - x.mean())) - elif model == "AlphaSimR": - phenotype_df = tstrait.normalise_phenotypes( - phenotype_df, mean=0, var=1, ddof=0 - ) - elif model == "ARG-Needle": - phenotype_df = tstrait.normalise_phenotypes(phenotype_df, mean=0, var=1) + return phenotype_df - tstrait_phenotype = phenotype_df["phenotype"] - simulated_phenotype = simulation_output.phenotype["phenotype"].values - np.testing.assert_array_almost_equal(tstrait_phenotype, simulated_phenotype) +class ExactTestSimplePHENOTYPES(ExactTest): + """ + Exact test between simplePHENOTYPES and tstrait. + """ - def test_exact_simplePHENOTYPES_single(self): - self._run( - model="simplePHENOTYPES", + def _compare_simplePHENOTYPES( + self, + num_causal, + random_seed, + add_effect=1, + num_trait=1, + add_effect_2=1, + numericalization=False, + ): + ts = self._simulate_tree_seq(random_seed) + simulation_output = _simulate_simplePHENOTYPES_exact( + ts=ts, + num_causal=num_causal, + random_seed=random_seed, + add_effect=add_effect, + add_effect_2=add_effect_2, + num_trait=num_trait, + ) + phenotype_df = self._compute_tstrait( + ts=ts, trait_df=simulation_output.trait, numericalization=numericalization + ) + grouped = phenotype_df.groupby("trait_id")[["phenotype"]] + phenotype_df = grouped.transform(lambda x: (x - x.mean())) + + np.testing.assert_array_almost_equal( + phenotype_df["phenotype"].values, + simulation_output.phenotype["phenotype"].values, + ) + + def test_exact_simplePHENOTYPES_single_causal_30_add_09(self): + self._compare_simplePHENOTYPES( num_causal=30, add_effect=0.9, random_seed=100, num_trait=1, - add_effect_2=1, + numericalization=False, ) - def test_exact_simplePHENOTYPES_pleiotropic(self): - self._run( - model="simplePHENOTYPES", - num_causal=30, + def test_exact_simplePHENOTYPES_single_causal_50_add_09(self): + self._compare_simplePHENOTYPES( + num_causal=50, add_effect=0.9, random_seed=101, + num_trait=1, + numericalization=False, + ) + + def test_exact_simplePHENOTYPES_single_causal_30_add_11(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=1.1, + random_seed=102, + num_trait=1, + numericalization=False, + ) + + def test_exact_simplePHENOTYPES_single_causal_50_add_11(self): + self._compare_simplePHENOTYPES( + num_causal=50, + add_effect=1.1, + random_seed=103, + num_trait=1, + numericalization=False, + ) + + def test_exact_simplePHENOTYPES_single_causal_30_add_09_numer(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=0.9, + random_seed=104, + num_trait=1, + numericalization=True, + ) + + def test_exact_simplePHENOTYPES_single_causal_30_add_11_numer(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=1.1, + random_seed=106, + num_trait=1, + numericalization=True, + ) + + def test_exact_simplePHENOTYPES_single_causal_50_add_11_numer(self): + self._compare_simplePHENOTYPES( + num_causal=50, + add_effect=1.1, + random_seed=107, + num_trait=1, + numericalization=True, + ) + + def test_exact_simplePHENOTYPES_pleio_causal_30_add_09_08(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=0.9, + random_seed=108, num_trait=2, add_effect_2=0.8, + numericalization=False, ) - def test_exact_AlphaSimR_single(self): - self._run( - model="AlphaSimR", num_causal=100, random_seed=200, corA=1, num_trait=1 + def test_exact_simplePHENOTYPES_pleio_causal_50_add_11_08(self): + self._compare_simplePHENOTYPES( + num_causal=50, + add_effect=1.1, + random_seed=110, + num_trait=2, + add_effect_2=0.8, + numericalization=False, ) - def test_exact_AlphaSimR_pleiotropic(self): - self._run( - model="AlphaSimR", num_causal=100, random_seed=201, corA=0.8, num_trait=2 + def test_exact_simplePHENOTYPES_pleio_causal_30_add_08_08(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=0.8, + random_seed=111, + num_trait=2, + add_effect_2=0.8, + numericalization=False, ) - def test_exact_ARG_Needle(self): - self._run(model="ARG-Needle", alpha=0, random_seed=300) + def test_exact_simplePHENOTYPES_pleio_causal_50_add_09_08_numer(self): + self._compare_simplePHENOTYPES( + num_causal=50, + add_effect=0.9, + random_seed=112, + num_trait=2, + add_effect_2=0.8, + numericalization=True, + ) + + def test_exact_simplePHENOTYPES_pleio_causal_30_add_08_08_numer(self): + self._compare_simplePHENOTYPES( + num_causal=30, + add_effect=0.8, + random_seed=113, + num_trait=2, + add_effect_2=0.8, + numericalization=True, + ) + + +class ExactTestAlphaSimR(ExactTest): + """ + Exact test between AlphaSimR and tstrait. + """ + + def _compare_AlphaSimR( + self, num_causal, random_seed, corA=1, num_trait=1, numericalization=False + ): + ts = self._simulate_tree_seq(random_seed) + simulation_output = _simulate_AlphaSimR( + ts=ts, + num_causal=num_causal, + h2=1, + random_seed=random_seed, + corA=corA, + num_trait=num_trait, + ) + phenotype_df = self._compute_tstrait( + ts=ts, trait_df=simulation_output.trait, numericalization=numericalization + ) + phenotype_df = tstrait.normalise_phenotypes(phenotype_df, mean=0, var=1, ddof=0) + + np.testing.assert_array_almost_equal( + phenotype_df["phenotype"].values, + simulation_output.phenotype["phenotype"].values, + ) + + def test_exact_AlphaSimR_single_causal_30(self): + self._compare_AlphaSimR(num_causal=30, random_seed=200, numericalization=False) + + def test_exact_AlphaSimR_single_causal_100(self): + self._compare_AlphaSimR(num_causal=100, random_seed=201, numericalization=False) + + def test_exact_AlphaSimR_single_causal_30_numer(self): + self._compare_AlphaSimR(num_causal=30, random_seed=202, numericalization=True) + + def test_exact_AlphaSimR_single_causal_100_numer(self): + self._compare_AlphaSimR(num_causal=100, random_seed=203, numericalization=True) + + def test_exact_AlphaSimR_pleiotropic_causal_30_cor08(self): + self._compare_AlphaSimR( + num_causal=30, + random_seed=204, + corA=0.8, + num_trait=2, + numericalization=False, + ) + + def test_exact_AlphaSimR_pleiotropic_causal_30_cor01(self): + self._compare_AlphaSimR( + num_causal=30, + random_seed=205, + corA=0.1, + num_trait=2, + numericalization=False, + ) + + def test_exact_AlphaSimR_pleiotropic_causal_100_cor01(self): + self._compare_AlphaSimR( + num_causal=100, + random_seed=206, + corA=0.1, + num_trait=2, + numericalization=False, + ) + + def test_exact_AlphaSimR_pleiotropic_causal_100_cor08(self): + self._compare_AlphaSimR( + num_causal=100, + random_seed=207, + corA=0.8, + num_trait=2, + numericalization=False, + ) + + def test_exact_AlphaSimR_pleiotropic_causal_30_cor08_numer(self): + self._compare_AlphaSimR( + num_causal=30, random_seed=208, corA=0.8, num_trait=2, numericalization=True + ) + + def test_exact_AlphaSimR_pleiotropic_causal_30_cor01_numer(self): + self._compare_AlphaSimR( + num_causal=30, random_seed=209, corA=0.1, num_trait=2, numericalization=True + ) + + +class ExactTestARGNeedle(ExactTest): + """ + Exact test between the simulation framework described in the + ARG-Needle paper and tstrait. + """ + + def _compare_arg_needle(self, alpha, random_seed, numericalization=False): + ts = self._simulate_tree_seq(random_seed) + simulation_output = _simulate_arg_needle( + ts=ts, alpha=alpha, h2=1, random_seed=random_seed + ) + phenotype_df = self._compute_tstrait( + ts=ts, trait_df=simulation_output.trait, numericalization=numericalization + ) + phenotype_df = tstrait.normalise_phenotypes(phenotype_df, mean=0, var=1) + + np.testing.assert_array_almost_equal( + phenotype_df["phenotype"].values, + simulation_output.phenotype["phenotype"].values, + ) + + def test_exact_ARG_Needle_alpha_0(self): + self._compare_arg_needle(alpha=0, random_seed=300, numericalization=False) + + def test_exact_ARG_Needle_alpha_negative_1(self): + self._compare_arg_needle(alpha=-1, random_seed=301, numericalization=False) + + def test_exact_ARG_Needle_alpha_1(self): + self._compare_arg_needle(alpha=1, random_seed=302, numericalization=False) + + def test_exact_ARG_Needle_alpha_0_numer(self): + self._compare_arg_needle(alpha=0, random_seed=303, numericalization=True) + + def test_exact_ARG_Needle_alpha_negative_1_numer(self): + self._compare_arg_needle(alpha=-1, random_seed=304, numericalization=True) + + def test_exact_ARG_Needle_alpha_1_numer(self): + self._compare_arg_needle(alpha=1, random_seed=305, numericalization=True) + + +class ComparisonTest(Test): + """ + Compare tstrait against currently available simulators and create a QQ-plot. We + assume an infinite sites model and we also assume that all sites are causal. + """ + + def _compare_phenotype(self, data1, data1_name, data2, data2_name, ind_id): + """ + The input of this function should be a pandas dataframe with individual_id and + phenotype columns. The values inside the individual_id column must be a number + and match with the 2 dataframes that are provided inside this function. + `ind_id` is a list of individual IDs that are of interest to compare. + """ + + for i in ind_id: + data1_phenotype = data1.loc[data1["individual_id"] == i]["phenotype"].values + data2_phenotype = data2.loc[data2["individual_id"] == i]["phenotype"].values + self._plot_qq_compare( + data1=data1_phenotype, + data1_name=f"{data1_name}_ind_{i}", + data2=data2_phenotype, + data2_name=f"{data2_name}_ind_{i}", + ) + + def _simulate_tstrait(self, ts, num_rep, h2, random_seed, mean=0, var=1, alpha=0): + """ + Simulates trait and phenotype information based on a simulated tree + sequence and return the simulated phenotypes that are normalised. + We assume that all sites are causal. + """ + + model = tstrait.trait_model(distribution="normal", mean=mean, var=var) + phenotype_df_list = [] + for i in range(num_rep): + sim_result = tstrait.sim_phenotype( + ts, + num_causal=ts.num_sites, + alpha=alpha, + model=model, + h2=h2, + random_seed=random_seed + i, + ) + phenotype_df = tstrait.normalise_phenotypes(sim_result.phenotype) + phenotype_df_list.append(phenotype_df) + + phenotype_result = pd.concat(phenotype_df_list).reset_index(drop=True) + + return phenotype_result + + def _simulate_out_of_africa(self, random_seed): + """ + Simulate a tree sequence data from the Out of Africa model and select individuals + from different populations. + """ + rng = np.random.default_rng(random_seed) + species = stdpopsim.get_species("HomSap") + model = species.get_demographic_model("OutOfAfrica_3G09") + contig = species.get_contig("chr22", mutation_rate=0, length_multiplier=1e-3) + samples = {"YRI": 300, "CHB": 300, "CEU": 300} + engine = stdpopsim.get_engine("msprime") + ts = engine.simulate(model, contig, samples, seed=random_seed) + ts = msprime.sim_mutations( + ts, rate=model.mutation_rate, random_seed=random_seed, discrete_genome=False + ) + node0 = rng.choice(ts.samples(population=0)) + node1 = rng.choice(ts.samples(population=1)) + node2 = rng.choice(ts.samples(population=2)) + ind_id = np.array( + [ + ts.node(node0).individual, + ts.node(node1).individual, + ts.node(node2).individual, + ] + ) + return ts, ind_id + + +class ComparisonTestSimplePHENOTYPES(ComparisonTest): + def _qqplot_simplePHENOTYPES( + self, num_rep, h2, random_seed, mean=0, var=1, alpha=0 + ): + ts, ind_id = self._simulate_out_of_africa(random_seed=random_seed) + tstrait_phenotype = self._simulate_tstrait( + ts=ts, + num_rep=num_rep, + h2=h2, + random_seed=random_seed, + mean=mean, + var=var, + alpha=alpha, + ) + # - mean because simplePHENOTYPES uses ancestral state as a causal allele + simplePHENOTYPES_phenotype = _simulate_simplePHENOTYPES_qqplot( + ts=ts, + num_causal=ts.num_sites, + h2=h2, + random_seed=random_seed, + num_rep=num_rep, + mean=-mean, + var=var, + ) + self._compare_phenotype( + data1=tstrait_phenotype, + data1_name="tstrait", + data2=simplePHENOTYPES_phenotype, + data2_name="simplePHENOTYPES", + ind_id=ind_id, + ) + + def test_compare_simplePHENOTYPES_h2_08_mean_0_var_1(self): + self._qqplot_simplePHENOTYPES( + num_rep=500, + h2=0.8, + random_seed=1000, + mean=0, + var=1, + ) + + def test_compare_simplePHENOTYPES_h2_09_mean_1_var_4(self): + self._qqplot_simplePHENOTYPES( + num_rep=500, + h2=0.9, + random_seed=1001, + mean=1, + var=4, + ) + + def test_compare_simplePHENOTYPES_h2_02_mean_negative_1_var_4(self): + self._qqplot_simplePHENOTYPES( + num_rep=500, + h2=0.2, + random_seed=1002, + mean=-1, + var=4, + ) + + def test_compare_simplePHENOTYPES_h2_01_mean_0_var_4(self): + self._qqplot_simplePHENOTYPES( + num_rep=500, + h2=0.1, + random_seed=1001, + mean=0, + var=4, + ) + + +class ComparisonTestAlphaSimR(ComparisonTest): + def _simulate_tstrait_genetic_normalise(self, ts, num_rep, h2, random_seed): + """ + Simulates trait and phenotype information based on a simulated tree + sequence and return the simulated phenotypes that are normalised. + We assume that all sites are causal. + + AlphaSimR simulates effect sizes from a standard normal distribution + and normalizes the genetic value afterwards. + """ + + model = tstrait.trait_model(distribution="normal", mean=0, var=1) + phenotype_df_list = [] + for i in range(num_rep): + trait_df = tstrait.sim_trait( + ts=ts, num_causal=ts.num_sites, model=model, random_seed=random_seed + i + ) + genetic_df = tstrait.genetic_value(ts, trait_df) + normalised_df = tstrait.normalise_genetic_value(genetic_df) + + phenotype_df = tstrait.sim_env( + normalised_df, h2=h2, random_seed=random_seed + i + ) + + phenotype_df_list.append(phenotype_df) + + phenotype_result = pd.concat(phenotype_df_list).reset_index(drop=True) + + return phenotype_result + + def _qqplot_AlphaSimR(self, num_rep, h2, random_seed): + ts, ind_id = self._simulate_out_of_africa(random_seed=random_seed) + tstrait_phenotype = self._simulate_tstrait_genetic_normalise( + ts=ts, num_rep=num_rep, h2=h2, random_seed=random_seed + ) + alphasimr_phenotype = _simulate_AlphaSimR( + ts=ts, + num_causal=ts.num_sites, + h2=h2, + random_seed=random_seed, + num_rep=num_rep, + ) + self._compare_phenotype( + data1=tstrait_phenotype, + data1_name="tstrait", + data2=alphasimr_phenotype.phenotype, + data2_name="AlphaSimR", + ind_id=ind_id, + ) + + def test_compare_AlphaSimR_h2_08(self): + self._qqplot_AlphaSimR(num_rep=500, random_seed=2000, h2=0.8) + + def test_compare_AlphaSimR_h2_02(self): + self._qqplot_AlphaSimR(num_rep=500, random_seed=2001, h2=0.2) + + def test_compare_AlphaSimR_h2_05(self): + self._qqplot_AlphaSimR(num_rep=500, random_seed=2000, h2=0.5) + + +class ComparisonTestArgNeedle(ComparisonTest): + def _qqplot_arg_needle(self, num_rep, h2, alpha, random_seed): + ts, ind_id = self._simulate_out_of_africa(random_seed=random_seed) + tstrait_phenotype = self._simulate_tstrait( + ts=ts, num_rep=num_rep, h2=h2, random_seed=random_seed, alpha=alpha + ) + argneedle_phenotype = _simulate_arg_needle( + ts=ts, alpha=alpha, h2=h2, random_seed=random_seed, num_rep=num_rep + ) + self._compare_phenotype( + data1=tstrait_phenotype, + data1_name="tstrait", + data2=argneedle_phenotype.phenotype, + data2_name="ARG-Needle", + ind_id=ind_id, + ) + + def test_compare_arg_needle_alpha_0_h2_08(self): + self._qqplot_arg_needle(num_rep=500, h2=0.8, random_seed=3000, alpha=0) + + def test_compare_arg_needle_alpha_negative_1_h2_08(self): + self._qqplot_arg_needle(num_rep=500, h2=0.8, random_seed=3001, alpha=-1) + + def test_compare_arg_needle_alpha_positive_1_h2_09(self): + self._qqplot_arg_needle(num_rep=500, h2=0.9, random_seed=3002, alpha=1) + + def test_compare_arg_needle_alpha_positive_2_h2_01(self): + self._qqplot_arg_needle(num_rep=500, h2=0.1, random_seed=3003, alpha=2) + + def test_compare_arg_needle_alpha_negative_2_h2_01(self): + self._qqplot_arg_needle(num_rep=500, h2=0.1, random_seed=3004, alpha=-2) + + def test_compare_arg_needle_alpha_0_h2_02(self): + self._qqplot_arg_needle(num_rep=500, h2=0.2, random_seed=3005, alpha=0) + + def test_compare_arg_needle_alpha_negative_1_h2_05(self): + self._qqplot_arg_needle(num_rep=500, h2=0.5, random_seed=3006, alpha=-1) + + def test_compare_arg_needle_alpha_positive_10_h2_09(self): + self._qqplot_arg_needle(num_rep=500, h2=0.9, random_seed=3007, alpha=10) + + def test_compare_arg_needle_alpha_negative_10_h2_08(self): + self._qqplot_arg_needle(num_rep=500, h2=0.8, random_seed=3008, alpha=-10) def model_list(loc, scale):