Skip to content

Commit

Permalink
Merge pull request #63 from BioJulia/dev
Browse files Browse the repository at this point in the history
v0.5.2 release
  • Loading branch information
pdimens authored Mar 24, 2021
2 parents f12f3fb + a64ea3d commit 3a54b77
Show file tree
Hide file tree
Showing 6 changed files with 205 additions and 77 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "PopGen"
uuid = "af524d12-c74b-11e9-22a8-3b091653023f"
authors = ["Pavel Dimens", "Jason Selwyn"]
version = "0.5.1"
version = "0.5.2"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Expand Down
31 changes: 29 additions & 2 deletions src/AlleleFreq.jl
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ end
#TODO add to docs (API/AlleleFreq)
#BUG
"""
avg_allele_freq(allele_dicts::AbstractVector{T}, power::Int) where T<:Dict{Signed,Float32}
avg_allele_freq(allele_dicts::AbstractVector{Dict{T, Float64}}, power::Int = 1)
Takes a Vector of Dicts generated by `allele_freq` and returns a Dict of the average
allele frequencies raised to the `power` (exponent) specified (default: `1`).
This is typically done to calculate average allele frequencies across populations.
Expand All @@ -87,7 +87,7 @@ DataFrames.combine(
function avg_allele_freq(allele_dicts::AbstractVector{Dict{T, Float64}}, power::Int = 1) where T<:Integer
sum_dict = Dict{Int16, Tuple{Float32, Int}}()
# remove any dicts with no entries (i.e. from a group without that locus)
allele_dicts = allele_dicts[length.(allele_dicts) .> 0]
allele_dicts = allele_dicts[findall(i -> length(i) > 0, allele_dicts)]
# create a list of all the alleles
all_alleles = keys.(allele_dicts) |> Base.Iterators.flatten |> collect |> unique
# populate the sum dict with allele frequency and n for each allele
Expand All @@ -109,6 +109,33 @@ function avg_allele_freq(allele_dicts::AbstractVector{Dict{T, Float64}}, power::
return avg_dict
end

# method for nei_fst (pairwise)
function avg_allele_freq(allele_dicts::T, power::Int = 1) where T<:Tuple
sum_dict = Dict{Int16, Tuple{Float32, Int}}()
# remove any dicts with no entries (i.e. from a group without that locus)
allele_dicts = allele_dicts[findall(i -> length(i) > 0, allele_dicts)]
# create a list of all the alleles
all_alleles = keys.(allele_dicts) |> Base.Iterators.flatten |> collect |> unique
# populate the sum dict with allele frequency and n for each allele
@inbounds for allele in all_alleles
for allele_dict in allele_dicts
@inbounds sum_dict[allele] = get!(sum_dict, allele, (0., 0)) .+ (get!(allele_dict, allele, 0.), 1)
end
end
avg_dict = Dict{Int16, Float32}()
# collapse the sum dict into a dict of averages
@inbounds for (key, value) in sum_dict
freq_sum, n = value
avg = (freq_sum / n) ^ power
# drop zeroes
if !iszero(avg)
@inbounds avg_dict[key] = avg
end
end
return avg_dict
end


"""
allele_freq(data::PopData, locus::String; population::Bool = false)
Return a `Dict` of allele frequencies of a single locus in a `PopData`
Expand Down
140 changes: 71 additions & 69 deletions src/FStatistics/FstMethods.jl
Original file line number Diff line number Diff line change
@@ -1,103 +1,101 @@
## Nei 1987 FST ##
function nei_fst(data::AbstractDataFrame)
# observed/expected het per locus per pop
het_df = DataFrames.combine(
groupby(data, [:locus, :population]),
:genotype => nonmissing => :n,
:genotype => hetero_o => :het_obs,
:genotype => hetero_e => :het_exp,
:genotype => allele_freq => :alleles
)
# collapse down to retrieve averages and counts
n_df = DataFrames.combine(
groupby(het_df, :locus),
:n => count_nonzeros => :count,
:n => (n -> count_nonzeros(n) / reciprocal_sum(n)) => :mn,
[:het_obs, :het_exp, :n] => ((o,e,n) -> mean(skipmissing(gene_diversity_nei87.(e, o, count_nonzeros(n) / reciprocal_sum(n))))) => :HS,
:het_obs => (o -> mean(skipmissing(o)))=> :Het_obs,
:alleles => (alleles -> sum(values(avg_allele_freq(alleles, 2))))=> :avg_freq
)
function nei_fst(population_1::T, population_2::T) where T<:AbstractMatrix
n = hcat(map(nonmissing, eachcol(population_1)), map(nonmissing, eachcol(population_2)))
# number of populations represented per locus
n_pop_per_loc = map(count_nonzeros, eachrow(n))
# corrected n for population size
corr_n_per_loc = n_pop_per_loc ./ map(reciprocal_sum, eachrow(n))
# observed heterozygosity
het_obs_p1 = map(hetero_o, eachcol(population_1))
het_obs_p2 = map(hetero_o, eachcol(population_2))
# expected heterozygosity
het_exp_p1 = map(hetero_e, eachcol(population_1))
het_exp_p2 = map(hetero_e, eachcol(population_2))
# genic diversity for population 1 and 2
p1_nei = gene_diversity_nei87.(het_exp_p1, het_obs_p1, corr_n_per_loc)
p2_nei = gene_diversity_nei87.(het_exp_p2, het_obs_p2, corr_n_per_loc)
# mean genic diversity
HS = map(i -> mean(skipmissing(i)), zip(p1_nei, p2_nei))
# allele freqs for population 1 and 2
alle_freq_p1 = map(allele_freq, eachcol(population_1))
alle_freq_p2 = map(allele_freq, eachcol(population_2))
# sum of the squared average allele frequencies
avg_freq = avg_allele_freq.(zip(alle_freq_p1, alle_freq_p2),2) .|> values .|> sum
# average observed heterozygosity per locus
Het_obs = map(i -> mean(skipmissing(i)), zip(het_obs_p1, het_obs_p2))

Ht = @inbounds 1.0 .- n_df.avg_freq .+ (n_df.HS ./ n_df.mn ./ n_df.count) - (n_df.Het_obs ./ 2.0 ./ n_df.mn ./ n_df.count)
DST = @inbounds Ht .- n_df.HS
DST′ = @inbounds n_df.count ./ (n_df.count .- 1) .* DST
HT′ = @inbounds n_df.HS .+ DST′
#TODO replace safemean
Ht = @inbounds 1.0 .- avg_freq .+ (HS ./ corr_n_per_loc ./ n_pop_per_loc) - (Het_obs ./ 2.0 ./ corr_n_per_loc ./ n_pop_per_loc)
DST = @inbounds Ht .- HS
DST′ = @inbounds n_pop_per_loc ./ (n_pop_per_loc .- 1) .* DST
HT′ = @inbounds HS .+ DST′
round(mean(skipinfnan(DST′)) / mean(skipinfnan(HT′)), digits = 5)
end

function _pairwise_Nei(data::PopData)
idx_pdata = groupby(data.loci, :population)
pops = getindex.(keys(idx_pdata), :population)
npops = length(idx_pdata)
n_loci = size(data)[2]
results = zeros(npops, npops)
@sync for i in 2:npops
Base.Threads.@spawn begin
for j in 1:(i-1)
results[i,j] = nei_fst(DataFrame(idx_pdata[[j,i]]))
end
pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
results[i,j] = nei_fst(pop1,pop2)
end
end
end
return PairwiseFST(DataFrame(results, Symbol.(pops)), "Nei 1987")
end


## Weir & Cockerham 1984 FST ##
function _wc_helper(x::AbstractArray{Float64,2})
view(x,:,1) ./ sum(x, dims = 2)
end

function weircockerham_fst(data::AbstractDataFrame)
loc_names = unique(data.locus)
n_loci = length(loc_names)
n_pops = length(unique(data.population))
function weircockerham_fst(population_1::T, population_2::T) where T<:AbstractMatrix
n_loci = size(population_1, 2)
n_pops = 2
# get genotype counts
per_locpop = groupby(data, [:locus, :population])
nonmissing_counts = DataFrames.combine(per_locpop, :genotype => nonmissing => :n)
# reshape as a matrix of loci x pop (row x col)
n_per_locpop = reshape(nonmissing_counts.n', (n_pops, n_loci))'
n_total = reduce(+, eachcol(n_per_locpop))
n_per_locpop = hcat(map(nonmissing, eachcol(population_1)), map(nonmissing, eachcol(population_2)))
n_total = sum(n_per_locpop, dims = 2)
# screen for completely absent loci
missing_loc = findall(iszero, n_total)
if !isempty(missing_loc)
present_loc = n_loci - length(missing_loc)
present_loc = 0 .∉ eachrow(n_per_locpop)
if 0 present_loc
# index locus names by the missing indices
tmp_data = filter(:locus => loc -> loc loc_names[missing_loc], data)
loc_names = unique(tmp_data.locus)
n_loci = length(loc_names)
per_locpop = groupby(tmp_data, [:locus, :population])
nonmissing_counts = DataFrames.combine(per_locpop, :genotype => nonmissing => :n)
# reshape as a matrix of loci x pop (row x col)
n_per_locpop = reshape(nonmissing_counts.n', (n_pops, present_loci))'
n_total = reduce(+, eachcol(n_per_locpop))
pop_1 = @view population_1[:,present_loc]
pop_2 = @view population_2[:,present_loc]
n_per_locpop = hcat(map(nonmissing, eachcol(pop_1)), map(nonmissing, eachcol(pop_2)))
n_total = sum(n_per_locpop, dims = 2)
else
tmp_data = data
pop_1 = population_1
pop_2 = population_2
end

merged = vcat(pop_1, pop_2)
n_pop_per_loc = map(count_nonzeros, eachrow(n_per_locpop))
per_loc = groupby(tmp_data, :locus)
# find all the alleles for
allele_counts = DataFrames.combine(
per_loc,
:genotype => allele_count => :allele_count, # allele counts (global)
:genotype => allele_freq => :freqs, # allele freqs (global)
)

# global allele counts
glob_allele_counts = map(allele_count, eachcol(merged))
# global allele frequencies
glob_allele_freqs = map(allele_freq, eachcol(merged))

# allele freqs per locus per population
alle_freqs = DataFrames.combine(per_locpop, :genotype => allele_freq => :freqs)
pop_1_freq = map(allele_freq, eachcol(pop_1))
pop_2_freq = map(allele_freq, eachcol(pop_2))
#alle_freqs = DataFrames.combine(per_locpop, :genotype => allele_freq => :freqs)
# expand out the n matrix to be same dimensions as unique_alleles x pop
n_expanded = reduce(hcat, repeat.(eachrow(n_per_locpop), 1, allele_counts.allele_count)) |> permutedims
n_expanded = reduce(hcat, repeat.(eachrow(n_per_locpop), 1, glob_allele_counts)) |> permutedims
# expand n_total matrix to be same dimensions as unique_alleles x pop
n_tot_expanded = reduce(vcat, repeat.(eachrow(n_total), allele_counts.allele_count))
n_tot_expanded = reduce(vcat, repeat.(eachrow(n_total), glob_allele_counts))
# calculate corrected n per locus
corr_n_per_loc = (n_total .- (sum(n_per_locpop .^2, dims = 2) ./ n_total)) ./ (n_pop_per_loc .- 1)
# expand corr_n matrix to be same dimensions as unique_alleles x pop
corr_n_per_loc_exp = reduce(vcat, repeat.(eachrow(corr_n_per_loc), allele_counts.allele_count))
corr_n_per_loc_exp = reduce(vcat, repeat.(eachrow(corr_n_per_loc), glob_allele_counts))
# list of alleles in each locus
_alleles_perloc = [sort(unique_alleles(i.genotype)) for i in per_loc]
_alleles_perloc = [sort(unique_alleles(i)) for i in eachcol(merged)]
# extremely convoluted, creates a big matrix of allele freqs per locus per population
# TODO are there too many reshapes going on?
#TODO move into its own function? This seems like it could be a recurring thing
afreq_tmp = permutedims(reshape(alle_freqs.freqs, n_pops, :))
afreq_tmp = hcat(pop_1_freq, pop_2_freq)
allele_freq_pop = reshape(
reduce(vcat,
map(zip(eachrow(afreq_tmp), _alleles_perloc)) do (_freqs_p, _alle)
Expand All @@ -111,12 +109,13 @@ function weircockerham_fst(data::AbstractDataFrame)
:, n_pops # reshape by these dimensions
)
# global allele freqs
_freqs = map(i -> values(sort(i)), allele_counts.freqs) |> Base.Iterators.flatten |> collect
_freqs = map(i -> values(sort(i)), glob_allele_freqs) |> Base.Iterators.flatten |> collect

#heterozygotes per allele per locus per population
# gets emptied from the popfirst! calls below(!)
genos_vec = [i.genotype for i in per_locpop]

_p1 = collect(eachcol(pop_1))
_p2 = collect(eachcol(pop_2))
genos_vec = permutedims([_p1 _p2])[:]
# create matrix of heterozygote occurrences per allele per pop
het_mtx = reduce(vcat, # vcat will concatenate the returned matrices into a single matrix
map(_alleles_perloc) do _alleles # map across the vector of alleles for each locus
Expand All @@ -133,7 +132,7 @@ function weircockerham_fst(data::AbstractDataFrame)
SSG = sum(n_expanded .* allele_freq_pop - μ_het, dims = 2)
SSi = sum(n_expanded .* (allele_freq_pop - 2 * allele_freq_pop .^ 2) + μ_het, dims = 2)
SSP = 2 .* sum(n_expanded .* reduce(hcat, map(i -> (i .- _freqs) .^ 2, eachcol(allele_freq_pop))), dims = 2)
n_correction = reduce(vcat, fill.(n_pop_per_loc, allele_counts.allele_count))
n_correction = reduce(vcat, fill.(n_pop_per_loc, glob_allele_counts))

MSG = SSG ./ n_tot_expanded
MSP = SSP ./ (n_correction .- 1)
Expand All @@ -151,12 +150,15 @@ function _pairwise_WeirCockerham(data::PopData)
idx_pdata = groupby(data.loci, :population)
pops = getindex.(keys(idx_pdata), :population)
npops = length(idx_pdata)
n_loci = size(data)[2]
results = zeros(npops, npops)
@sync for i in 2:npops
Base.Threads.@spawn begin
for j in 1:(i-1)
results[i,j] = weircockerham_fst(DataFrame(idx_pdata[[j,i]]))
end
pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
results[i,j] = weircockerham_fst(pop1,pop2)
end
end
end
return PairwiseFST(DataFrame(results, Symbol.(pops)), "Weir & Cockerham")
Expand Down
82 changes: 82 additions & 0 deletions src/FStatistics/FstPermutations.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#TODO add to docs/API

function _permuted_WeirCockerham(data::PopData, iterations::Int64)
idx_pdata = groupby(data.loci, :population)
pops = getindex.(keys(idx_pdata), :population)
npops = length(idx_pdata)
n_loci = size(data)[2]
results = zeros(npops, npops)
perm_vector = zeros(iterations-1)
p = Progress(sum(1:(npops - 1)) * (iterations - 1), dt = 1, color = :blue, barglyphs = BarGlyphs("|=> |"), barlen = 30)
@inbounds for i in 2:npops
@inbounds for j in 1:(i-1)
pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
fst_val = weircockerham_fst(pop1,pop2)
@inbounds @sync for iter in 1:iterations-1
Base.Threads.@spawn begin
perm_p1, perm_p2 = _fst_permutation(pop1, pop2)
perm_vector[iter] = weircockerham_fst(perm_p1, perm_p2)
pops_text = string(pops[i]) * " & " * string(pops[j])
ProgressMeter.next!(p; showvalues = [(:Populations, pops_text), (:Iteration, "$iter")])
end
end
@inbounds results[i,j] = fst_val
@inbounds results[j,i] = (sum(fst_val .<= perm_vector) + 1) / iterations
end
end
println("Below diagonal: FST values | Above diagonal: P values")
return PairwiseFST(DataFrame(results, Symbol.(pops)), "Weir & Cockerham (with p-values)")
end

function _permuted_Nei(data::PopData, iterations::Int64)
idx_pdata = groupby(data.loci, :population)
pops = getindex.(keys(idx_pdata), :population)
npops = length(idx_pdata)
n_loci = size(data)[2]
results = zeros(npops, npops)
perm_vector = zeros(iterations-1)
p = Progress(sum(1:(npops - 1)) * (iterations - 1), dt = 1, color = :blue, barglyphs = BarGlyphs("|=> |"), barlen = 30)
@inbounds for i in 2:npops
@inbounds for j in 1:(i-1)
pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
fst_val = nei_fst(pop1,pop2)
@inbounds @sync for iter in 1:iterations-1
Base.Threads.@spawn begin
perm_p1, perm_p2 = _fst_permutation(pop1, pop2)
perm_vector[iter] = nei_fst(perm_p1, perm_p2)
pops_text = string(pops[i]) * " & " * string(pops[j])
ProgressMeter.next!(p; showvalues = [(:Populations, pops_text), (:Iteration, "$iter")])
end
end
@inbounds results[i,j] = fst_val
@inbounds results[j,i] = (sum(fst_val .<= perm_vector) + 1) / iterations
end
end
println("Below diagonal: FST values | Above diagonal: P values")
return PairwiseFST(DataFrame(results, Symbol.(pops)), "Nei 1987 (with p-values)")
end


"""
_fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
Returns two matrices with rows (samples) shuffled between them. Respects the
number of rows of the original matrices (i.e. population sizes).
"""
function _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
merged = vcat(population_1, population_2)
# get n for each population
p1_size = size(population_1, 1)
p2_size = size(population_2, 1)
# total number of samples
n_samples = p1_size + p2_size
# generate permutation indices for the first "population"
perm1 = sample(Xoroshiro128Star(), 1:n_samples, p1_size, replace = false)
# generate permutation indices for the second "population"
perm2 = @inbounds (1:n_samples)[Not(perm1)]
# index the merged matrix with the permuted indices
new_pop_1 = @inbounds @view merged[perm1,:]
new_pop_2 = @inbounds @view merged[perm2,:]
return new_pop_1, new_pop_2
end
26 changes: 21 additions & 5 deletions src/FStatistics/PairwiseFST.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,29 @@ end

#TODO add methods and complete docstring
"""
pairwise_fst(data::PopData; method::String)
Calculate pairwise FST between populations in a `PopData` object.
pairwise_fst(data::PopData; method::String, iterations::Int64)
Calculate pairwise FST between populations in a `PopData` object. Set `iterations`
to a value greater than `0` to perform a single-tailed permutation test to obtain
P-values of statistical significance.
#### Methods:
- `"Nei87"`: Nei (1987) method
- `"WC84"` : Weir-Cockerham (1984) method (default)
**Examples**
```julia
data = @nancycats
wc = pairwise_fst(data, method = "WC84")
wc_sig = pairwise_fst(data, iterations = 1000)
```
"""
function pairwise_fst(data::PopData; method::String = "WC")
occursin("nei", lowercase(method)) && return _pairwise_Nei(data)
occursin("wc", lowercase(method)) && return _pairwise_WeirCockerham(data)
function pairwise_fst(data::PopData; method::String = "WC84", iterations::Int64 = 0)
if occursin("nei", lowercase(method))
iterations > 0 && return _permuted_Nei(data, iterations)
iterations == 0 && return _pairwise_Nei(data)
elseif occursin("wc", lowercase(method))
iterations > 0 && return _permuted_WeirCockerham(data, iterations)
iterations == 0 && return _pairwise_WeirCockerham(data)
else
throw(ArgumentError("please use one of \"WC84\" or \"Nei87\" methods"))
end
end
1 change: 1 addition & 0 deletions src/PopGen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ include("SummaryInfo.jl")
include("HardyWeinberg.jl")
include("FStatistics/FstMethods.jl")
include("FStatistics/PairwiseFST.jl")
include("FStatistics/FstPermutations.jl")
include("Relatedness/PairwiseRelatedness.jl")
include("Relatedness/RelatednessMoments.jl")
include("Relatedness/RelatednessPostHocs.jl")
Expand Down

2 comments on commit 3a54b77

@pdimens
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register branch=release

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/32755

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.5.2 -m "<description of version>" 3a54b77156b6c3b008b6915027ae97ca6d2e99c2
git push origin v0.5.2

Please sign in to comment.