diff --git a/Project.toml b/Project.toml index 1f5604c93..ab5981062 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PopGen" uuid = "af524d12-c74b-11e9-22a8-3b091653023f" authors = ["Pavel Dimens", "Jason Selwyn"] -version = "0.5.1" +version = "0.5.2" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" diff --git a/src/AlleleFreq.jl b/src/AlleleFreq.jl index 803801a0e..1621e4488 100755 --- a/src/AlleleFreq.jl +++ b/src/AlleleFreq.jl @@ -66,7 +66,7 @@ end #TODO add to docs (API/AlleleFreq) #BUG """ - avg_allele_freq(allele_dicts::AbstractVector{T}, power::Int) where T<:Dict{Signed,Float32} + avg_allele_freq(allele_dicts::AbstractVector{Dict{T, Float64}}, power::Int = 1) Takes a Vector of Dicts generated by `allele_freq` and returns a Dict of the average allele frequencies raised to the `power` (exponent) specified (default: `1`). This is typically done to calculate average allele frequencies across populations. @@ -87,7 +87,7 @@ DataFrames.combine( function avg_allele_freq(allele_dicts::AbstractVector{Dict{T, Float64}}, power::Int = 1) where T<:Integer sum_dict = Dict{Int16, Tuple{Float32, Int}}() # remove any dicts with no entries (i.e. from a group without that locus) - allele_dicts = allele_dicts[length.(allele_dicts) .> 0] + allele_dicts = allele_dicts[findall(i -> length(i) > 0, allele_dicts)] # create a list of all the alleles all_alleles = keys.(allele_dicts) |> Base.Iterators.flatten |> collect |> unique # populate the sum dict with allele frequency and n for each allele @@ -109,6 +109,33 @@ function avg_allele_freq(allele_dicts::AbstractVector{Dict{T, Float64}}, power:: return avg_dict end +# method for nei_fst (pairwise) +function avg_allele_freq(allele_dicts::T, power::Int = 1) where T<:Tuple + sum_dict = Dict{Int16, Tuple{Float32, Int}}() + # remove any dicts with no entries (i.e. from a group without that locus) + allele_dicts = allele_dicts[findall(i -> length(i) > 0, allele_dicts)] + # create a list of all the alleles + all_alleles = keys.(allele_dicts) |> Base.Iterators.flatten |> collect |> unique + # populate the sum dict with allele frequency and n for each allele + @inbounds for allele in all_alleles + for allele_dict in allele_dicts + @inbounds sum_dict[allele] = get!(sum_dict, allele, (0., 0)) .+ (get!(allele_dict, allele, 0.), 1) + end + end + avg_dict = Dict{Int16, Float32}() + # collapse the sum dict into a dict of averages + @inbounds for (key, value) in sum_dict + freq_sum, n = value + avg = (freq_sum / n) ^ power + # drop zeroes + if !iszero(avg) + @inbounds avg_dict[key] = avg + end + end + return avg_dict + end + + """ allele_freq(data::PopData, locus::String; population::Bool = false) Return a `Dict` of allele frequencies of a single locus in a `PopData` diff --git a/src/FStatistics/FstMethods.jl b/src/FStatistics/FstMethods.jl index c8482e2e2..5f9c2c310 100644 --- a/src/FStatistics/FstMethods.jl +++ b/src/FStatistics/FstMethods.jl @@ -1,28 +1,33 @@ ## Nei 1987 FST ## -function nei_fst(data::AbstractDataFrame) - # observed/expected het per locus per pop - het_df = DataFrames.combine( - groupby(data, [:locus, :population]), - :genotype => nonmissing => :n, - :genotype => hetero_o => :het_obs, - :genotype => hetero_e => :het_exp, - :genotype => allele_freq => :alleles - ) - # collapse down to retrieve averages and counts - n_df = DataFrames.combine( - groupby(het_df, :locus), - :n => count_nonzeros => :count, - :n => (n -> count_nonzeros(n) / reciprocal_sum(n)) => :mn, - [:het_obs, :het_exp, :n] => ((o,e,n) -> mean(skipmissing(gene_diversity_nei87.(e, o, count_nonzeros(n) / reciprocal_sum(n))))) => :HS, - :het_obs => (o -> mean(skipmissing(o)))=> :Het_obs, - :alleles => (alleles -> sum(values(avg_allele_freq(alleles, 2))))=> :avg_freq - ) +function nei_fst(population_1::T, population_2::T) where T<:AbstractMatrix + n = hcat(map(nonmissing, eachcol(population_1)), map(nonmissing, eachcol(population_2))) + # number of populations represented per locus + n_pop_per_loc = map(count_nonzeros, eachrow(n)) + # corrected n for population size + corr_n_per_loc = n_pop_per_loc ./ map(reciprocal_sum, eachrow(n)) + # observed heterozygosity + het_obs_p1 = map(hetero_o, eachcol(population_1)) + het_obs_p2 = map(hetero_o, eachcol(population_2)) + # expected heterozygosity + het_exp_p1 = map(hetero_e, eachcol(population_1)) + het_exp_p2 = map(hetero_e, eachcol(population_2)) + # genic diversity for population 1 and 2 + p1_nei = gene_diversity_nei87.(het_exp_p1, het_obs_p1, corr_n_per_loc) + p2_nei = gene_diversity_nei87.(het_exp_p2, het_obs_p2, corr_n_per_loc) + # mean genic diversity + HS = map(i -> mean(skipmissing(i)), zip(p1_nei, p2_nei)) + # allele freqs for population 1 and 2 + alle_freq_p1 = map(allele_freq, eachcol(population_1)) + alle_freq_p2 = map(allele_freq, eachcol(population_2)) + # sum of the squared average allele frequencies + avg_freq = avg_allele_freq.(zip(alle_freq_p1, alle_freq_p2),2) .|> values .|> sum + # average observed heterozygosity per locus + Het_obs = map(i -> mean(skipmissing(i)), zip(het_obs_p1, het_obs_p2)) - Ht = @inbounds 1.0 .- n_df.avg_freq .+ (n_df.HS ./ n_df.mn ./ n_df.count) - (n_df.Het_obs ./ 2.0 ./ n_df.mn ./ n_df.count) - DST = @inbounds Ht .- n_df.HS - DST′ = @inbounds n_df.count ./ (n_df.count .- 1) .* DST - HT′ = @inbounds n_df.HS .+ DST′ - #TODO replace safemean + Ht = @inbounds 1.0 .- avg_freq .+ (HS ./ corr_n_per_loc ./ n_pop_per_loc) - (Het_obs ./ 2.0 ./ corr_n_per_loc ./ n_pop_per_loc) + DST = @inbounds Ht .- HS + DST′ = @inbounds n_pop_per_loc ./ (n_pop_per_loc .- 1) .* DST + HT′ = @inbounds HS .+ DST′ round(mean(skipinfnan(DST′)) / mean(skipinfnan(HT′)), digits = 5) end @@ -30,12 +35,15 @@ function _pairwise_Nei(data::PopData) idx_pdata = groupby(data.loci, :population) pops = getindex.(keys(idx_pdata), :population) npops = length(idx_pdata) + n_loci = size(data)[2] results = zeros(npops, npops) @sync for i in 2:npops Base.Threads.@spawn begin for j in 1:(i-1) - results[i,j] = nei_fst(DataFrame(idx_pdata[[j,i]])) - end + pop1 = reshape(idx_pdata[i].genotype, :, n_loci) + pop2 = reshape(idx_pdata[j].genotype, :, n_loci) + results[i,j] = nei_fst(pop1,pop2) + end end end return PairwiseFST(DataFrame(results, Symbol.(pops)), "Nei 1987") @@ -43,61 +51,51 @@ end ## Weir & Cockerham 1984 FST ## -function _wc_helper(x::AbstractArray{Float64,2}) - view(x,:,1) ./ sum(x, dims = 2) -end - -function weircockerham_fst(data::AbstractDataFrame) - loc_names = unique(data.locus) - n_loci = length(loc_names) - n_pops = length(unique(data.population)) +function weircockerham_fst(population_1::T, population_2::T) where T<:AbstractMatrix + n_loci = size(population_1, 2) + n_pops = 2 # get genotype counts - per_locpop = groupby(data, [:locus, :population]) - nonmissing_counts = DataFrames.combine(per_locpop, :genotype => nonmissing => :n) # reshape as a matrix of loci x pop (row x col) - n_per_locpop = reshape(nonmissing_counts.n', (n_pops, n_loci))' - n_total = reduce(+, eachcol(n_per_locpop)) + n_per_locpop = hcat(map(nonmissing, eachcol(population_1)), map(nonmissing, eachcol(population_2))) + n_total = sum(n_per_locpop, dims = 2) # screen for completely absent loci - missing_loc = findall(iszero, n_total) - if !isempty(missing_loc) - present_loc = n_loci - length(missing_loc) + present_loc = 0 .∉ eachrow(n_per_locpop) + if 0 ∈ present_loc # index locus names by the missing indices - tmp_data = filter(:locus => loc -> loc ∉ loc_names[missing_loc], data) - loc_names = unique(tmp_data.locus) - n_loci = length(loc_names) - per_locpop = groupby(tmp_data, [:locus, :population]) - nonmissing_counts = DataFrames.combine(per_locpop, :genotype => nonmissing => :n) - # reshape as a matrix of loci x pop (row x col) - n_per_locpop = reshape(nonmissing_counts.n', (n_pops, present_loci))' - n_total = reduce(+, eachcol(n_per_locpop)) + pop_1 = @view population_1[:,present_loc] + pop_2 = @view population_2[:,present_loc] + n_per_locpop = hcat(map(nonmissing, eachcol(pop_1)), map(nonmissing, eachcol(pop_2))) + n_total = sum(n_per_locpop, dims = 2) else - tmp_data = data + pop_1 = population_1 + pop_2 = population_2 end - + merged = vcat(pop_1, pop_2) n_pop_per_loc = map(count_nonzeros, eachrow(n_per_locpop)) - per_loc = groupby(tmp_data, :locus) - # find all the alleles for - allele_counts = DataFrames.combine( - per_loc, - :genotype => allele_count => :allele_count, # allele counts (global) - :genotype => allele_freq => :freqs, # allele freqs (global) - ) + + # global allele counts + glob_allele_counts = map(allele_count, eachcol(merged)) + # global allele frequencies + glob_allele_freqs = map(allele_freq, eachcol(merged)) + # allele freqs per locus per population - alle_freqs = DataFrames.combine(per_locpop, :genotype => allele_freq => :freqs) + pop_1_freq = map(allele_freq, eachcol(pop_1)) + pop_2_freq = map(allele_freq, eachcol(pop_2)) + #alle_freqs = DataFrames.combine(per_locpop, :genotype => allele_freq => :freqs) # expand out the n matrix to be same dimensions as unique_alleles x pop - n_expanded = reduce(hcat, repeat.(eachrow(n_per_locpop), 1, allele_counts.allele_count)) |> permutedims + n_expanded = reduce(hcat, repeat.(eachrow(n_per_locpop), 1, glob_allele_counts)) |> permutedims # expand n_total matrix to be same dimensions as unique_alleles x pop - n_tot_expanded = reduce(vcat, repeat.(eachrow(n_total), allele_counts.allele_count)) + n_tot_expanded = reduce(vcat, repeat.(eachrow(n_total), glob_allele_counts)) # calculate corrected n per locus corr_n_per_loc = (n_total .- (sum(n_per_locpop .^2, dims = 2) ./ n_total)) ./ (n_pop_per_loc .- 1) # expand corr_n matrix to be same dimensions as unique_alleles x pop - corr_n_per_loc_exp = reduce(vcat, repeat.(eachrow(corr_n_per_loc), allele_counts.allele_count)) + corr_n_per_loc_exp = reduce(vcat, repeat.(eachrow(corr_n_per_loc), glob_allele_counts)) # list of alleles in each locus - _alleles_perloc = [sort(unique_alleles(i.genotype)) for i in per_loc] + _alleles_perloc = [sort(unique_alleles(i)) for i in eachcol(merged)] # extremely convoluted, creates a big matrix of allele freqs per locus per population # TODO are there too many reshapes going on? #TODO move into its own function? This seems like it could be a recurring thing - afreq_tmp = permutedims(reshape(alle_freqs.freqs, n_pops, :)) + afreq_tmp = hcat(pop_1_freq, pop_2_freq) allele_freq_pop = reshape( reduce(vcat, map(zip(eachrow(afreq_tmp), _alleles_perloc)) do (_freqs_p, _alle) @@ -111,12 +109,13 @@ function weircockerham_fst(data::AbstractDataFrame) :, n_pops # reshape by these dimensions ) # global allele freqs - _freqs = map(i -> values(sort(i)), allele_counts.freqs) |> Base.Iterators.flatten |> collect + _freqs = map(i -> values(sort(i)), glob_allele_freqs) |> Base.Iterators.flatten |> collect #heterozygotes per allele per locus per population # gets emptied from the popfirst! calls below(!) - genos_vec = [i.genotype for i in per_locpop] - + _p1 = collect(eachcol(pop_1)) + _p2 = collect(eachcol(pop_2)) + genos_vec = permutedims([_p1 _p2])[:] # create matrix of heterozygote occurrences per allele per pop het_mtx = reduce(vcat, # vcat will concatenate the returned matrices into a single matrix map(_alleles_perloc) do _alleles # map across the vector of alleles for each locus @@ -133,7 +132,7 @@ function weircockerham_fst(data::AbstractDataFrame) SSG = sum(n_expanded .* allele_freq_pop - μ_het, dims = 2) SSi = sum(n_expanded .* (allele_freq_pop - 2 * allele_freq_pop .^ 2) + μ_het, dims = 2) SSP = 2 .* sum(n_expanded .* reduce(hcat, map(i -> (i .- _freqs) .^ 2, eachcol(allele_freq_pop))), dims = 2) - n_correction = reduce(vcat, fill.(n_pop_per_loc, allele_counts.allele_count)) + n_correction = reduce(vcat, fill.(n_pop_per_loc, glob_allele_counts)) MSG = SSG ./ n_tot_expanded MSP = SSP ./ (n_correction .- 1) @@ -151,12 +150,15 @@ function _pairwise_WeirCockerham(data::PopData) idx_pdata = groupby(data.loci, :population) pops = getindex.(keys(idx_pdata), :population) npops = length(idx_pdata) + n_loci = size(data)[2] results = zeros(npops, npops) @sync for i in 2:npops Base.Threads.@spawn begin for j in 1:(i-1) - results[i,j] = weircockerham_fst(DataFrame(idx_pdata[[j,i]])) - end + pop1 = reshape(idx_pdata[i].genotype, :, n_loci) + pop2 = reshape(idx_pdata[j].genotype, :, n_loci) + results[i,j] = weircockerham_fst(pop1,pop2) + end end end return PairwiseFST(DataFrame(results, Symbol.(pops)), "Weir & Cockerham") diff --git a/src/FStatistics/FstPermutations.jl b/src/FStatistics/FstPermutations.jl new file mode 100644 index 000000000..27552ce3c --- /dev/null +++ b/src/FStatistics/FstPermutations.jl @@ -0,0 +1,82 @@ +#TODO add to docs/API + +function _permuted_WeirCockerham(data::PopData, iterations::Int64) + idx_pdata = groupby(data.loci, :population) + pops = getindex.(keys(idx_pdata), :population) + npops = length(idx_pdata) + n_loci = size(data)[2] + results = zeros(npops, npops) + perm_vector = zeros(iterations-1) + p = Progress(sum(1:(npops - 1)) * (iterations - 1), dt = 1, color = :blue, barglyphs = BarGlyphs("|=> |"), barlen = 30) + @inbounds for i in 2:npops + @inbounds for j in 1:(i-1) + pop1 = reshape(idx_pdata[i].genotype, :, n_loci) + pop2 = reshape(idx_pdata[j].genotype, :, n_loci) + fst_val = weircockerham_fst(pop1,pop2) + @inbounds @sync for iter in 1:iterations-1 + Base.Threads.@spawn begin + perm_p1, perm_p2 = _fst_permutation(pop1, pop2) + perm_vector[iter] = weircockerham_fst(perm_p1, perm_p2) + pops_text = string(pops[i]) * " & " * string(pops[j]) + ProgressMeter.next!(p; showvalues = [(:Populations, pops_text), (:Iteration, "$iter")]) + end + end + @inbounds results[i,j] = fst_val + @inbounds results[j,i] = (sum(fst_val .<= perm_vector) + 1) / iterations + end + end + println("Below diagonal: FST values | Above diagonal: P values") + return PairwiseFST(DataFrame(results, Symbol.(pops)), "Weir & Cockerham (with p-values)") +end + +function _permuted_Nei(data::PopData, iterations::Int64) + idx_pdata = groupby(data.loci, :population) + pops = getindex.(keys(idx_pdata), :population) + npops = length(idx_pdata) + n_loci = size(data)[2] + results = zeros(npops, npops) + perm_vector = zeros(iterations-1) + p = Progress(sum(1:(npops - 1)) * (iterations - 1), dt = 1, color = :blue, barglyphs = BarGlyphs("|=> |"), barlen = 30) + @inbounds for i in 2:npops + @inbounds for j in 1:(i-1) + pop1 = reshape(idx_pdata[i].genotype, :, n_loci) + pop2 = reshape(idx_pdata[j].genotype, :, n_loci) + fst_val = nei_fst(pop1,pop2) + @inbounds @sync for iter in 1:iterations-1 + Base.Threads.@spawn begin + perm_p1, perm_p2 = _fst_permutation(pop1, pop2) + perm_vector[iter] = nei_fst(perm_p1, perm_p2) + pops_text = string(pops[i]) * " & " * string(pops[j]) + ProgressMeter.next!(p; showvalues = [(:Populations, pops_text), (:Iteration, "$iter")]) + end + end + @inbounds results[i,j] = fst_val + @inbounds results[j,i] = (sum(fst_val .<= perm_vector) + 1) / iterations + end + end + println("Below diagonal: FST values | Above diagonal: P values") + return PairwiseFST(DataFrame(results, Symbol.(pops)), "Nei 1987 (with p-values)") +end + + +""" + _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix +Returns two matrices with rows (samples) shuffled between them. Respects the +number of rows of the original matrices (i.e. population sizes). +""" +function _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix + merged = vcat(population_1, population_2) + # get n for each population + p1_size = size(population_1, 1) + p2_size = size(population_2, 1) + # total number of samples + n_samples = p1_size + p2_size + # generate permutation indices for the first "population" + perm1 = sample(Xoroshiro128Star(), 1:n_samples, p1_size, replace = false) + # generate permutation indices for the second "population" + perm2 = @inbounds (1:n_samples)[Not(perm1)] + # index the merged matrix with the permuted indices + new_pop_1 = @inbounds @view merged[perm1,:] + new_pop_2 = @inbounds @view merged[perm2,:] + return new_pop_1, new_pop_2 +end \ No newline at end of file diff --git a/src/FStatistics/PairwiseFST.jl b/src/FStatistics/PairwiseFST.jl index 1178793c9..3260485da 100644 --- a/src/FStatistics/PairwiseFST.jl +++ b/src/FStatistics/PairwiseFST.jl @@ -22,13 +22,29 @@ end #TODO add methods and complete docstring """ - pairwise_fst(data::PopData; method::String) -Calculate pairwise FST between populations in a `PopData` object. + pairwise_fst(data::PopData; method::String, iterations::Int64) +Calculate pairwise FST between populations in a `PopData` object. Set `iterations` +to a value greater than `0` to perform a single-tailed permutation test to obtain +P-values of statistical significance. #### Methods: - `"Nei87"`: Nei (1987) method - `"WC84"` : Weir-Cockerham (1984) method (default) + +**Examples** +```julia +data = @nancycats +wc = pairwise_fst(data, method = "WC84") +wc_sig = pairwise_fst(data, iterations = 1000) +``` """ -function pairwise_fst(data::PopData; method::String = "WC") - occursin("nei", lowercase(method)) && return _pairwise_Nei(data) - occursin("wc", lowercase(method)) && return _pairwise_WeirCockerham(data) +function pairwise_fst(data::PopData; method::String = "WC84", iterations::Int64 = 0) + if occursin("nei", lowercase(method)) + iterations > 0 && return _permuted_Nei(data, iterations) + iterations == 0 && return _pairwise_Nei(data) + elseif occursin("wc", lowercase(method)) + iterations > 0 && return _permuted_WeirCockerham(data, iterations) + iterations == 0 && return _pairwise_WeirCockerham(data) + else + throw(ArgumentError("please use one of \"WC84\" or \"Nei87\" methods")) + end end \ No newline at end of file diff --git a/src/PopGen.jl b/src/PopGen.jl index 43edd3a2f..f531c3fd8 100755 --- a/src/PopGen.jl +++ b/src/PopGen.jl @@ -78,6 +78,7 @@ include("SummaryInfo.jl") include("HardyWeinberg.jl") include("FStatistics/FstMethods.jl") include("FStatistics/PairwiseFST.jl") +include("FStatistics/FstPermutations.jl") include("Relatedness/PairwiseRelatedness.jl") include("Relatedness/RelatednessMoments.jl") include("Relatedness/RelatednessPostHocs.jl")