Merge pull request #63 from BioJulia/dev

v0.5.2 release
BioJulia · Mar 24, 2021 · 3a54b77 · 3a54b77 · pdimens · Mar 24, 2021
2 parents f12f3fb + a64ea3d
commit 3a54b77
Show file tree

Hide file tree

Showing 6 changed files with 205 additions and 77 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "PopGen"
 uuid = "af524d12-c74b-11e9-22a8-3b091653023f"
 authors = ["Pavel Dimens", "Jason Selwyn"]
-version = "0.5.1"
+version = "0.5.2"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"

diff --git a/src/AlleleFreq.jl b/src/AlleleFreq.jl
@@ -66,7 +66,7 @@ end
 #TODO add to docs (API/AlleleFreq)
 #BUG
 """
-    avg_allele_freq(allele_dicts::AbstractVector{T}, power::Int) where T<:Dict{Signed,Float32}
+    avg_allele_freq(allele_dicts::AbstractVector{Dict{T, Float64}}, power::Int = 1)
 Takes a Vector of Dicts generated by `allele_freq` and returns a Dict of the average
 allele frequencies raised to the `power` (exponent) specified (default: `1`). 
 This is typically done to calculate average allele frequencies across populations.
@@ -87,7 +87,7 @@ DataFrames.combine(
 function avg_allele_freq(allele_dicts::AbstractVector{Dict{T, Float64}}, power::Int = 1) where T<:Integer   
    sum_dict = Dict{Int16, Tuple{Float32, Int}}()
    # remove any dicts with no entries (i.e. from a group without that locus)
-   allele_dicts = allele_dicts[length.(allele_dicts) .> 0]
+   allele_dicts = allele_dicts[findall(i -> length(i) > 0, allele_dicts)]
    # create a list of all the alleles
    all_alleles = keys.(allele_dicts) |> Base.Iterators.flatten |> collect |> unique
    # populate the sum dict with allele frequency and n for each allele
@@ -109,6 +109,33 @@ function avg_allele_freq(allele_dicts::AbstractVector{Dict{T, Float64}}, power::
    return avg_dict
 end
 
+# method for nei_fst (pairwise)
+function avg_allele_freq(allele_dicts::T, power::Int = 1) where T<:Tuple
+    sum_dict = Dict{Int16, Tuple{Float32, Int}}()
+    # remove any dicts with no entries (i.e. from a group without that locus)
+    allele_dicts = allele_dicts[findall(i -> length(i) > 0, allele_dicts)]
+    # create a list of all the alleles
+    all_alleles = keys.(allele_dicts) |> Base.Iterators.flatten |> collect |> unique
+    # populate the sum dict with allele frequency and n for each allele
+    @inbounds for allele in all_alleles
+        for allele_dict in allele_dicts
+            @inbounds sum_dict[allele] = get!(sum_dict, allele, (0., 0)) .+ (get!(allele_dict, allele, 0.), 1)
+        end
+    end
+    avg_dict = Dict{Int16, Float32}()
+    # collapse the sum dict into a dict of averages
+    @inbounds for (key, value) in sum_dict
+        freq_sum, n = value
+        avg = (freq_sum / n) ^ power
+        # drop zeroes
+        if !iszero(avg)
+            @inbounds avg_dict[key] = avg
+        end
+    end
+    return avg_dict
+ end
+
+
 """
     allele_freq(data::PopData, locus::String; population::Bool = false)
 Return a `Dict` of allele frequencies of a single locus in a `PopData`

diff --git a/src/FStatistics/FstMethods.jl b/src/FStatistics/FstMethods.jl
@@ -1,103 +1,101 @@
 ## Nei 1987 FST ##
-function nei_fst(data::AbstractDataFrame)
-    # observed/expected het per locus per pop
-    het_df = DataFrames.combine(
-        groupby(data, [:locus, :population]),
-        :genotype => nonmissing => :n,
-        :genotype => hetero_o => :het_obs,
-        :genotype => hetero_e => :het_exp,
-        :genotype => allele_freq => :alleles
-    )
-    # collapse down to retrieve averages and counts
-    n_df = DataFrames.combine(
-        groupby(het_df, :locus),
-        :n => count_nonzeros => :count,
-        :n => (n -> count_nonzeros(n) / reciprocal_sum(n)) => :mn,
-        [:het_obs, :het_exp, :n] => ((o,e,n) -> mean(skipmissing(gene_diversity_nei87.(e, o, count_nonzeros(n) / reciprocal_sum(n))))) => :HS,
-        :het_obs => (o -> mean(skipmissing(o)))=> :Het_obs,
-        :alleles => (alleles ->  sum(values(avg_allele_freq(alleles, 2))))=> :avg_freq
-        )
+function nei_fst(population_1::T, population_2::T) where T<:AbstractMatrix
+    n =  hcat(map(nonmissing, eachcol(population_1)), map(nonmissing, eachcol(population_2)))
+    # number of populations represented per locus
+    n_pop_per_loc = map(count_nonzeros, eachrow(n)) 
+    # corrected n for population size
+    corr_n_per_loc = n_pop_per_loc ./ map(reciprocal_sum, eachrow(n))
+    # observed heterozygosity
+    het_obs_p1 = map(hetero_o, eachcol(population_1))
+    het_obs_p2 = map(hetero_o, eachcol(population_2))
+    # expected heterozygosity
+    het_exp_p1 = map(hetero_e, eachcol(population_1))
+    het_exp_p2 = map(hetero_e, eachcol(population_2))
+    # genic diversity for population 1 and 2
+    p1_nei = gene_diversity_nei87.(het_exp_p1, het_obs_p1, corr_n_per_loc)
+    p2_nei = gene_diversity_nei87.(het_exp_p2, het_obs_p2, corr_n_per_loc)
+    # mean genic diversity
+    HS = map(i -> mean(skipmissing(i)), zip(p1_nei, p2_nei))
+    # allele freqs for population 1 and 2
+    alle_freq_p1 = map(allele_freq, eachcol(population_1))
+    alle_freq_p2 = map(allele_freq, eachcol(population_2))
+    # sum of the squared average allele frequencies
+    avg_freq = avg_allele_freq.(zip(alle_freq_p1, alle_freq_p2),2) .|> values .|> sum
+    # average observed heterozygosity per locus
+    Het_obs = map(i -> mean(skipmissing(i)), zip(het_obs_p1, het_obs_p2))
 
-    Ht = @inbounds 1.0 .- n_df.avg_freq .+ (n_df.HS ./ n_df.mn ./ n_df.count) - (n_df.Het_obs ./ 2.0 ./ n_df.mn ./ n_df.count)
-    DST = @inbounds Ht .- n_df.HS
-    DST′ = @inbounds n_df.count ./ (n_df.count .- 1) .* DST
-    HT′ = @inbounds n_df.HS .+ DST′
-    #TODO replace safemean
+    Ht = @inbounds 1.0 .- avg_freq .+ (HS ./ corr_n_per_loc ./ n_pop_per_loc) - (Het_obs ./ 2.0 ./ corr_n_per_loc ./ n_pop_per_loc)
+    DST = @inbounds Ht .- HS
+    DST′ = @inbounds n_pop_per_loc ./ (n_pop_per_loc .- 1) .* DST
+    HT′ = @inbounds HS .+ DST′
     round(mean(skipinfnan(DST′)) / mean(skipinfnan(HT′)), digits = 5)
 end
 
 function _pairwise_Nei(data::PopData)
     idx_pdata = groupby(data.loci, :population)
     pops = getindex.(keys(idx_pdata), :population)
     npops = length(idx_pdata)
+    n_loci = size(data)[2]
     results = zeros(npops, npops)
     @sync for i in 2:npops
         Base.Threads.@spawn begin
             for j in 1:(i-1)
-                results[i,j] = nei_fst(DataFrame(idx_pdata[[j,i]]))
-            end
+                pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
+                pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
+                results[i,j] = nei_fst(pop1,pop2)
+           end
         end
     end
     return PairwiseFST(DataFrame(results, Symbol.(pops)), "Nei 1987")
 end
 
 
 ## Weir & Cockerham 1984 FST ##
-function _wc_helper(x::AbstractArray{Float64,2})
-    view(x,:,1) ./ sum(x, dims = 2)
-end
-
-function weircockerham_fst(data::AbstractDataFrame)
-    loc_names = unique(data.locus)
-    n_loci = length(loc_names)
-    n_pops = length(unique(data.population))
+function weircockerham_fst(population_1::T, population_2::T) where T<:AbstractMatrix
+    n_loci = size(population_1, 2)
+    n_pops = 2
     # get genotype counts
-    per_locpop = groupby(data, [:locus, :population])
-    nonmissing_counts = DataFrames.combine(per_locpop, :genotype => nonmissing => :n)
     # reshape as a matrix of loci x pop (row x col)
-    n_per_locpop = reshape(nonmissing_counts.n', (n_pops, n_loci))'
-    n_total = reduce(+, eachcol(n_per_locpop))
+    n_per_locpop =  hcat(map(nonmissing, eachcol(population_1)), map(nonmissing, eachcol(population_2)))
+    n_total = sum(n_per_locpop, dims = 2)
     # screen for completely absent loci
-    missing_loc = findall(iszero, n_total)
-    if !isempty(missing_loc)
-        present_loc = n_loci - length(missing_loc)
+    present_loc = 0 .∉ eachrow(n_per_locpop)
+    if 0 ∈ present_loc
         # index locus names by the missing indices
-        tmp_data = filter(:locus => loc -> loc ∉ loc_names[missing_loc], data)
-        loc_names = unique(tmp_data.locus)
-        n_loci = length(loc_names)
-        per_locpop = groupby(tmp_data, [:locus, :population])
-        nonmissing_counts = DataFrames.combine(per_locpop, :genotype => nonmissing => :n)
-        # reshape as a matrix of loci x pop (row x col)
-        n_per_locpop = reshape(nonmissing_counts.n', (n_pops, present_loci))'
-        n_total = reduce(+, eachcol(n_per_locpop))
+        pop_1 = @view population_1[:,present_loc]
+        pop_2 = @view population_2[:,present_loc]
+        n_per_locpop =  hcat(map(nonmissing, eachcol(pop_1)), map(nonmissing, eachcol(pop_2)))
+        n_total = sum(n_per_locpop, dims = 2)
     else
-        tmp_data = data
+        pop_1 = population_1
+        pop_2 = population_2
     end
-
+    merged = vcat(pop_1, pop_2)
     n_pop_per_loc = map(count_nonzeros, eachrow(n_per_locpop))
-    per_loc = groupby(tmp_data, :locus)
-    # find all the alleles for
-    allele_counts = DataFrames.combine(
-        per_loc, 
-        :genotype => allele_count => :allele_count,  # allele counts (global)
-        :genotype => allele_freq => :freqs,          # allele freqs (global)
-    )
+
+    # global allele counts
+    glob_allele_counts = map(allele_count, eachcol(merged))
+    # global allele frequencies
+    glob_allele_freqs = map(allele_freq, eachcol(merged))
+
     # allele freqs per locus per population
-    alle_freqs = DataFrames.combine(per_locpop, :genotype => allele_freq => :freqs)
+    pop_1_freq = map(allele_freq, eachcol(pop_1))
+    pop_2_freq = map(allele_freq, eachcol(pop_2))
+    #alle_freqs = DataFrames.combine(per_locpop, :genotype => allele_freq => :freqs)
     # expand out the n matrix to be same dimensions as unique_alleles x pop
-    n_expanded = reduce(hcat, repeat.(eachrow(n_per_locpop), 1, allele_counts.allele_count)) |> permutedims
+    n_expanded = reduce(hcat, repeat.(eachrow(n_per_locpop), 1, glob_allele_counts)) |> permutedims
     # expand n_total matrix to be same dimensions as unique_alleles x pop
-    n_tot_expanded = reduce(vcat, repeat.(eachrow(n_total), allele_counts.allele_count))
+    n_tot_expanded = reduce(vcat, repeat.(eachrow(n_total), glob_allele_counts))
     # calculate corrected n per locus
     corr_n_per_loc = (n_total .- (sum(n_per_locpop .^2, dims = 2) ./ n_total)) ./ (n_pop_per_loc .- 1) 
     # expand corr_n matrix to be same dimensions as unique_alleles x pop
-    corr_n_per_loc_exp = reduce(vcat, repeat.(eachrow(corr_n_per_loc), allele_counts.allele_count))
+    corr_n_per_loc_exp = reduce(vcat, repeat.(eachrow(corr_n_per_loc), glob_allele_counts))
     # list of alleles in each locus
-    _alleles_perloc = [sort(unique_alleles(i.genotype)) for i in per_loc]
+    _alleles_perloc = [sort(unique_alleles(i)) for i in eachcol(merged)]
     # extremely convoluted, creates a big matrix of allele freqs per locus per population
     # TODO are there too many reshapes going on?
     #TODO move into its own function? This seems like it could be a recurring thing
-    afreq_tmp = permutedims(reshape(alle_freqs.freqs, n_pops, :))
+    afreq_tmp = hcat(pop_1_freq, pop_2_freq)
     allele_freq_pop = reshape(
         reduce(vcat,
             map(zip(eachrow(afreq_tmp), _alleles_perloc)) do (_freqs_p, _alle)
@@ -111,12 +109,13 @@ function weircockerham_fst(data::AbstractDataFrame)
        :, n_pops  # reshape by these dimensions
     )
     # global allele freqs
-    _freqs = map(i -> values(sort(i)), allele_counts.freqs) |> Base.Iterators.flatten |> collect
+    _freqs = map(i -> values(sort(i)), glob_allele_freqs) |> Base.Iterators.flatten |> collect
 
     #heterozygotes per allele per locus per population
     # gets emptied from the popfirst! calls below(!)
-    genos_vec = [i.genotype for i in per_locpop]
-
+    _p1 = collect(eachcol(pop_1))
+    _p2 = collect(eachcol(pop_2))
+    genos_vec = permutedims([_p1 _p2])[:]
     # create matrix of heterozygote occurrences per allele per pop
     het_mtx = reduce(vcat,     # vcat will concatenate the returned matrices into a single matrix
         map(_alleles_perloc) do _alleles          # map across the vector of alleles for each locus
@@ -133,7 +132,7 @@ function weircockerham_fst(data::AbstractDataFrame)
     SSG = sum(n_expanded .* allele_freq_pop - μ_het, dims = 2)
     SSi = sum(n_expanded .* (allele_freq_pop - 2 * allele_freq_pop .^ 2) + μ_het, dims = 2)
     SSP = 2 .* sum(n_expanded .* reduce(hcat, map(i -> (i .- _freqs) .^ 2, eachcol(allele_freq_pop))), dims = 2)
-    n_correction = reduce(vcat, fill.(n_pop_per_loc, allele_counts.allele_count))
+    n_correction = reduce(vcat, fill.(n_pop_per_loc, glob_allele_counts))
 
     MSG = SSG ./ n_tot_expanded
     MSP = SSP ./ (n_correction .- 1)
@@ -151,12 +150,15 @@ function _pairwise_WeirCockerham(data::PopData)
     idx_pdata = groupby(data.loci, :population)
     pops = getindex.(keys(idx_pdata), :population)
     npops = length(idx_pdata)
+    n_loci = size(data)[2]
     results = zeros(npops, npops)
     @sync for i in 2:npops
         Base.Threads.@spawn begin
             for j in 1:(i-1)
-                results[i,j] = weircockerham_fst(DataFrame(idx_pdata[[j,i]]))
-            end
+                pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
+                pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
+                results[i,j] = weircockerham_fst(pop1,pop2)
+           end
         end
     end
     return PairwiseFST(DataFrame(results, Symbol.(pops)), "Weir & Cockerham")

diff --git a/src/FStatistics/FstPermutations.jl b/src/FStatistics/FstPermutations.jl
@@ -0,0 +1,82 @@
+#TODO add to docs/API
+
+function _permuted_WeirCockerham(data::PopData, iterations::Int64)
+    idx_pdata = groupby(data.loci, :population)
+    pops = getindex.(keys(idx_pdata), :population)
+    npops = length(idx_pdata)
+    n_loci = size(data)[2]
+    results = zeros(npops, npops)
+    perm_vector = zeros(iterations-1)
+    p = Progress(sum(1:(npops - 1)) * (iterations - 1), dt = 1, color = :blue, barglyphs = BarGlyphs("|=> |"), barlen = 30)
+    @inbounds for i in 2:npops
+        @inbounds for j in 1:(i-1)
+            pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
+            pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
+            fst_val = weircockerham_fst(pop1,pop2)
+            @inbounds @sync for iter in 1:iterations-1
+                Base.Threads.@spawn begin
+                    perm_p1, perm_p2 = _fst_permutation(pop1, pop2)
+                    perm_vector[iter] = weircockerham_fst(perm_p1, perm_p2)
+                    pops_text = string(pops[i]) * " & " * string(pops[j])
+                    ProgressMeter.next!(p; showvalues = [(:Populations, pops_text), (:Iteration, "$iter")])
+                end
+            end
+            @inbounds results[i,j] = fst_val
+            @inbounds results[j,i] = (sum(fst_val .<= perm_vector) + 1) / iterations
+        end
+    end
+    println("Below diagonal: FST values | Above diagonal: P values")
+    return PairwiseFST(DataFrame(results, Symbol.(pops)), "Weir & Cockerham (with p-values)")
+end
+
+function _permuted_Nei(data::PopData, iterations::Int64)
+    idx_pdata = groupby(data.loci, :population)
+    pops = getindex.(keys(idx_pdata), :population)
+    npops = length(idx_pdata)
+    n_loci = size(data)[2]
+    results = zeros(npops, npops)
+    perm_vector = zeros(iterations-1)
+    p = Progress(sum(1:(npops - 1)) * (iterations - 1), dt = 1, color = :blue, barglyphs = BarGlyphs("|=> |"), barlen = 30)
+    @inbounds for i in 2:npops
+        @inbounds for j in 1:(i-1)
+            pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
+            pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
+            fst_val = nei_fst(pop1,pop2)
+            @inbounds @sync for iter in 1:iterations-1
+                Base.Threads.@spawn begin
+                    perm_p1, perm_p2 = _fst_permutation(pop1, pop2)
+                    perm_vector[iter] = nei_fst(perm_p1, perm_p2)
+                    pops_text = string(pops[i]) * " & " * string(pops[j])
+                    ProgressMeter.next!(p; showvalues = [(:Populations, pops_text), (:Iteration, "$iter")])
+                end
+            end
+            @inbounds results[i,j] = fst_val
+            @inbounds results[j,i] = (sum(fst_val .<= perm_vector) + 1) / iterations
+        end
+    end
+    println("Below diagonal: FST values | Above diagonal: P values")
+    return PairwiseFST(DataFrame(results, Symbol.(pops)), "Nei 1987 (with p-values)")
+end
+
+
+"""
+    _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
+Returns two matrices with rows (samples) shuffled between them. Respects the
+number of rows of the original matrices (i.e. population sizes).
+"""
+function _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
+    merged = vcat(population_1, population_2)
+    # get n for each population
+    p1_size = size(population_1, 1)
+    p2_size = size(population_2, 1)
+    # total number of samples
+    n_samples = p1_size + p2_size
+    # generate permutation indices for the first "population"
+    perm1 = sample(Xoroshiro128Star(), 1:n_samples, p1_size, replace = false)
+    # generate permutation indices for the second "population"
+    perm2 = @inbounds (1:n_samples)[Not(perm1)]
+    # index the merged matrix with the permuted indices
+    new_pop_1 = @inbounds @view merged[perm1,:]
+    new_pop_2 = @inbounds @view merged[perm2,:]
+    return new_pop_1, new_pop_2 
+end
diff --git a/src/FStatistics/PairwiseFST.jl b/src/FStatistics/PairwiseFST.jl
@@ -22,13 +22,29 @@ end
 
 #TODO add methods and complete docstring
 """
-    pairwise_fst(data::PopData; method::String)
-Calculate pairwise FST between populations in a `PopData` object.
+    pairwise_fst(data::PopData; method::String, iterations::Int64)
+Calculate pairwise FST between populations in a `PopData` object. Set `iterations` 
+to a value greater than `0` to perform a single-tailed permutation test to obtain
+P-values of statistical significance.
 #### Methods:
 - `"Nei87"`: Nei (1987) method
 - `"WC84"` : Weir-Cockerham (1984) method (default)
+
+**Examples**
+```julia
+data = @nancycats
+wc = pairwise_fst(data, method = "WC84")
+wc_sig = pairwise_fst(data, iterations = 1000)
+```
 """
-function pairwise_fst(data::PopData; method::String = "WC")
-    occursin("nei", lowercase(method)) && return _pairwise_Nei(data)
-    occursin("wc", lowercase(method)) && return _pairwise_WeirCockerham(data)
+function pairwise_fst(data::PopData; method::String = "WC84", iterations::Int64 = 0)
+    if occursin("nei", lowercase(method))
+        iterations > 0 && return _permuted_Nei(data, iterations)
+        iterations == 0 && return _pairwise_Nei(data)
+    elseif occursin("wc", lowercase(method))
+        iterations > 0 && return _permuted_WeirCockerham(data, iterations)
+        iterations == 0 && return _pairwise_WeirCockerham(data)
+    else
+        throw(ArgumentError("please use one of \"WC84\" or \"Nei87\" methods"))
+    end
 end
diff --git a/src/PopGen.jl b/src/PopGen.jl
@@ -78,6 +78,7 @@ include("SummaryInfo.jl")
 include("HardyWeinberg.jl")
 include("FStatistics/FstMethods.jl")
 include("FStatistics/PairwiseFST.jl")
+include("FStatistics/FstPermutations.jl")
 include("Relatedness/PairwiseRelatedness.jl")
 include("Relatedness/RelatednessMoments.jl")
 include("Relatedness/RelatednessPostHocs.jl")