Skip to content

Commit

Permalink
Merge pull request #64 from BioJulia/dev
Browse files Browse the repository at this point in the history
v0.6.0
  • Loading branch information
pdimens authored Mar 31, 2021
2 parents 3a54b77 + 1338b9b commit c61fa81
Show file tree
Hide file tree
Showing 10 changed files with 230 additions and 809 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "PopGen"
uuid = "af524d12-c74b-11e9-22a8-3b091653023f"
authors = ["Pavel Dimens", "Jason Selwyn"]
version = "0.5.2"
version = "0.6.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
Expand Down
35 changes: 35 additions & 0 deletions data/example.vcf

Large diffs are not rendered by default.

754 changes: 0 additions & 754 deletions data/filtered_oyster.vcf

This file was deleted.

45 changes: 45 additions & 0 deletions src/FStatistics/FstMethods.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,48 @@
function pairwise_Hudson(data::PopData)
!isbiallelic(data) && throw(error("Data must be biallelic to use the Hudson estimator"))
idx_pdata = groupby(data.loci, :population)
pops = getindex.(keys(idx_pdata), :population)
npops = length(idx_pdata)
n_loci = size(data)[2]
results = zeros(npops, npops)
@sync for i in 2:npops
Base.Threads.@spawn begin
for j in 1:(i-1)
pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
results[i,j] = fst_hudson(pop1,pop2)
end
end
end
return PairwiseFST(DataFrame(results, Symbol.(pops)), "Hudson et al. 1992")
end

function hudson_fst(population_1::T, population_2::T) where T<:AbstractMatrix
fst_perloc = @views [_hudson_fst(population_1[:,i], population_2[:,i]) for i in 1:size(population_1,2)]
return mean(skipmissing(skipinfnan(fst_perloc)))
end

# helper function to do the math for Hudson FST on a locus
function _hudson_fst(pop1::T, pop2::T) where T<:GenoArray
p1_frq = allele_freq(pop1)
p2_frq = allele_freq(pop2)
# find the shared allele(s) and choose one of them to be "P"
# this is a safeguard if one population is completely homozygous for an allele
p_allele = intersect(keys(p1_frq), keys(p2_frq)) |> first
p1 = p1_frq[p_allele]
q1 = 1.0 - p1
p2 = p2_frq[p_allele]
q2 = 1.0 - p2
# degrees of freedom is the number of alleles - 1
df1 = (count(!ismissing, pop1) * 2) - 1
df2 = (count(!ismissing, pop2) * 2) - 1
numerator = (p1 - p2)^2 - (p1*q1/df1) - (p2*q2/df2)
denominator = (p1*q2) + (p2*q1)
return numerator/denominator
end



## Nei 1987 FST ##
function nei_fst(population_1::T, population_2::T) where T<:AbstractMatrix
n = hcat(map(nonmissing, eachcol(population_1)), map(nonmissing, eachcol(population_2)))
Expand Down
79 changes: 55 additions & 24 deletions src/FStatistics/FstPermutations.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,29 @@
#TODO add to docs/API
"""
_fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
Returns two matrices with rows (samples) shuffled between them. Respects the
number of rows of the original matrices (i.e. population sizes).
"""
function _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
merged = vcat(population_1, population_2)
# get n for each population
p1_size = size(population_1, 1)
p2_size = size(population_2, 1)
# total number of samples
n_samples = p1_size + p2_size
# generate permutation indices for the first "population"
perm1 = sample(Xoroshiro128Star(), 1:n_samples, p1_size, replace = false)
# generate permutation indices for the second "population"
perm2 = @inbounds (1:n_samples)[Not(perm1)]
# index the merged matrix with the permuted indices
new_pop_1 = @inbounds @view merged[perm1,:]
new_pop_2 = @inbounds @view merged[perm2,:]
return new_pop_1, new_pop_2
end

function _permuted_WeirCockerham(data::PopData, iterations::Int64)

function _permuted_Hudson(data::PopData, iterations::Int64)
!isbiallelic(data) && throw(error("Data must be biallelic to use the Hudson estimator"))
idx_pdata = groupby(data.loci, :population)
pops = getindex.(keys(idx_pdata), :population)
npops = length(idx_pdata)
Expand All @@ -12,11 +35,11 @@ function _permuted_WeirCockerham(data::PopData, iterations::Int64)
@inbounds for j in 1:(i-1)
pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
fst_val = weircockerham_fst(pop1,pop2)
fst_val = hudson_fst(pop1,pop2)
@inbounds @sync for iter in 1:iterations-1
Base.Threads.@spawn begin
perm_p1, perm_p2 = _fst_permutation(pop1, pop2)
perm_vector[iter] = weircockerham_fst(perm_p1, perm_p2)
perm_vector[iter] = hudson_fst(perm_p1, perm_p2)
pops_text = string(pops[i]) * " & " * string(pops[j])
ProgressMeter.next!(p; showvalues = [(:Populations, pops_text), (:Iteration, "$iter")])
end
Expand All @@ -26,9 +49,10 @@ function _permuted_WeirCockerham(data::PopData, iterations::Int64)
end
end
println("Below diagonal: FST values | Above diagonal: P values")
return PairwiseFST(DataFrame(results, Symbol.(pops)), "Weir & Cockerham (with p-values)")
return PairwiseFST(DataFrame(results, Symbol.(pops)), "Hudson et al. 1992 (with p-values)")
end


function _permuted_Nei(data::PopData, iterations::Int64)
idx_pdata = groupby(data.loci, :population)
pops = getindex.(keys(idx_pdata), :population)
Expand Down Expand Up @@ -59,24 +83,31 @@ function _permuted_Nei(data::PopData, iterations::Int64)
end


"""
_fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
Returns two matrices with rows (samples) shuffled between them. Respects the
number of rows of the original matrices (i.e. population sizes).
"""
function _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
merged = vcat(population_1, population_2)
# get n for each population
p1_size = size(population_1, 1)
p2_size = size(population_2, 1)
# total number of samples
n_samples = p1_size + p2_size
# generate permutation indices for the first "population"
perm1 = sample(Xoroshiro128Star(), 1:n_samples, p1_size, replace = false)
# generate permutation indices for the second "population"
perm2 = @inbounds (1:n_samples)[Not(perm1)]
# index the merged matrix with the permuted indices
new_pop_1 = @inbounds @view merged[perm1,:]
new_pop_2 = @inbounds @view merged[perm2,:]
return new_pop_1, new_pop_2
function _permuted_WeirCockerham(data::PopData, iterations::Int64)
idx_pdata = groupby(data.loci, :population)
pops = getindex.(keys(idx_pdata), :population)
npops = length(idx_pdata)
n_loci = size(data)[2]
results = zeros(npops, npops)
perm_vector = zeros(iterations-1)
p = Progress(sum(1:(npops - 1)) * (iterations - 1), dt = 1, color = :blue, barglyphs = BarGlyphs("|=> |"), barlen = 30)
@inbounds for i in 2:npops
@inbounds for j in 1:(i-1)
pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
fst_val = weircockerham_fst(pop1,pop2)
@inbounds @sync for iter in 1:iterations-1
Base.Threads.@spawn begin
perm_p1, perm_p2 = _fst_permutation(pop1, pop2)
perm_vector[iter] = weircockerham_fst(perm_p1, perm_p2)
pops_text = string(pops[i]) * " & " * string(pops[j])
ProgressMeter.next!(p; showvalues = [(:Populations, pops_text), (:Iteration, "$iter")])
end
end
@inbounds results[i,j] = fst_val
@inbounds results[j,i] = (sum(fst_val .<= perm_vector) + 1) / iterations
end
end
println("Below diagonal: FST values | Above diagonal: P values")
return PairwiseFST(DataFrame(results, Symbol.(pops)), "Weir & Cockerham (with p-values)")
end
6 changes: 5 additions & 1 deletion src/FStatistics/PairwiseFST.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@ Calculate pairwise FST between populations in a `PopData` object. Set `iteration
to a value greater than `0` to perform a single-tailed permutation test to obtain
P-values of statistical significance.
#### Methods:
- `"Hudson92"`: Hudson et al. (1992) method (only for biallelic data)
- `"Nei87"`: Nei (1987) method
- `"WC84"` : Weir-Cockerham (1984) method (default)
- `"WC84"` : Weir & Cockerham (1984) method (default)
**Examples**
```julia
Expand All @@ -44,6 +45,9 @@ function pairwise_fst(data::PopData; method::String = "WC84", iterations::Int64
elseif occursin("wc", lowercase(method))
iterations > 0 && return _permuted_WeirCockerham(data, iterations)
iterations == 0 && return _pairwise_WeirCockerham(data)
elseif occursin("hudson", lowercase(method))
iterations > 0 && return _permuted_Hudson(data, iterations)
iterations == 0 && return _pairwise_Hudson(data)
else
throw(ArgumentError("please use one of \"WC84\" or \"Nei87\" methods"))
end
Expand Down
102 changes: 81 additions & 21 deletions src/Utils.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
export size, drop_monomorphic, drop_monomorphic!
export size, drop_monomorphic, drop_monomorphic!, drop_multiallelic, drop_multiallelic!

## experimental and not exported or documented!
function adjacency_matrix(data::PopData)
Expand Down Expand Up @@ -128,19 +128,18 @@ Return a `PopData` object omitting any monomorphic loci. Will inform you which
loci were removed.
"""
function drop_monomorphic(data::PopData)
rm_loci = Vector{String}()
for (loc, loc_sdf) in pairs(groupby(data.loci, :locus))
length(unique(skipmissing(loc_sdf[:, :genotype]))) == 1 && push!(rm_loci, loc.locus)
end

if length(rm_loci) == 0
all_loci = loci(data)
mtx = reshape(data.loci.genotype, length(samples(data)), :)
monomorphs = [length(unique(skipmissing(x))) == 1 for x in eachcol(mtx)]
loci_to_rm = all_loci[monomorphs]
if length(loci_to_rm) == 0
return data
elseif length(rm_loci) == 1
@info "Dropping monomorphic locus " * rm_loci[1]
elseif length(loci_to_rm) == 1
@info "Removing monomorphic locus " * loci_to_rm[1]
else
@info "Dropping $(length(rm_loci)) monomorphic loci" * "\n $rm_loci"
@info "Removing $(length(loci_to_rm)) monomorphic loci:" * "\n $loci_to_rm"
end
exclude(data, locus = rm_loci)
exclude(data, locus = loci_to_rm)
end


Expand All @@ -150,20 +149,81 @@ Edit a `PopData` object in place by omitting any monomorphic loci. Will inform y
loci were removed.
"""
function drop_monomorphic!(data::PopData)
rm_loci = Vector{String}()
for (loc, loc_sdf) in pairs(groupby(data.loci, :locus))
length(unique(skipmissing(loc_sdf[:, :genotype]))) == 1 && push!(rm_loci, loc.locus)
all_loci = loci(data)
mtx = reshape(data.loci.genotype, length(samples(data)), :)
monomorphs = [length(unique(skipmissing(x))) == 1 for x in eachcol(mtx)]
loci_to_rm = all_loci[monomorphs]
if length(loci_to_rm) == 0
return data
elseif length(loci_to_rm) == 1
@info "Removing monomorphic locus " * loci_to_rm[1]
else
@info "Removing $(length(loci_to_rm)) monomorphic loci:" * "\n $loci_to_rm"
end
exclude!(data, locus = loci_to_rm)
end


#TODO add to docs
"""
drop_multiallelic(data::PopData)
Return a `PopData` object omitting loci that are not biallelic.
"""
function drop_multiallelic(data::PopData)
all_loci = loci(data)
mtx = reshape(data.loci.genotype, length(samples(data)), :)
nonbi = [!isbiallelic(x) for x in eachcol(mtx)]
loci_to_rm = all_loci[nonbi]
if length(loci_to_rm) == 0
return data
elseif length(loci_to_rm) == 1
@info "Removing 1 multiallelic locus"
else
@info "Removing $(length(loci_to_rm)) multialleic loci"
end
if length(rm_loci) == 0
exclude(data, locus = loci_to_rm)
end


"""
drop_multiallelic!(data::PopData)
Edit a `PopData` object in place, removing loci that are not biallelic.
"""
function drop_multiallelic!(data::PopData)
all_loci = loci(data)
mtx = reshape(data.loci.genotype, length(samples(data)), :)
nonbi = [!isbiallelic(x) for x in eachcol(mtx)]
loci_to_rm = all_loci[nonbi]
if length(loci_to_rm) == 0
return data
elseif length(rm_loci) == 1
@info "Dropping monomorphic locus " * rm_loci[1]
elseif length(loci_to_rm) == 1
@info "Removing 1 multiallelic locus"
else
@info "Dropping $(length(rm_loci)) monomorphic loci" * "\n $rm_loci"
@info "Removing $(length(loci_to_rm)) multialleic loci"
end
exclude!(data, locus = rm_loci)
exclude!(data, locus = loci_to_rm)
end


"""
isbiallelic(data::GenoArray)
Returns `true` if the `GenoArray` is biallelic, `false` if not.
"""
function isbiallelic(data::T) where T<:GenoArray
length(unique(Base.Iterators.flatten(skipmissing(data)))) == 2
end


"""
isbiallelic(data::PopData)
Returns `true` all the loci in the `PopData` are biallelic, `false` if not.
"""
function isbiallelic(data::PopData)
mtx = reshape(data.loci.genotype, length(samples(data)), :)
all(map(isbiallelic, eachcol(mtx)))
end


"""
loci_dataframe(data::PopData)
Return a wide `DataFrame` of samples as columns, ommitting population information.
Expand Down Expand Up @@ -563,8 +623,8 @@ julia> cats_meta = generate_meta(cats_nometa)
function generate_meta(data::DataFrame)
grp = groupby(data, :name)
nms = map(z -> z.name, keys(grp))
pops = map(z -> first(z.population), grp)
ploids = map(z -> find_ploidy(z.genotype), grp)
pops = [first(z.population) for z in grp]
ploids = [find_ploidy(z.genotype) for z in grp]
DataFrame(
:name => nms,
:population => pops,
Expand Down
4 changes: 2 additions & 2 deletions src/io/VariantCall.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
export bcf, vcf

"""
bcf(infile::String; ; rename_snp::Bool, silent::Bool, allow_monomorphic::Bool)
bcf(infile::String; ; rename_loci::Bool, silent::Bool, allow_monomorphic::Bool)
Load a BCF file into memory as a PopData object. Population information needs to be provided separately.
- `infile` : path to BCF file (can be gzipped)
Expand Down Expand Up @@ -33,7 +33,7 @@ end
### VCF parsing ###

"""
vcf(infile::String; ; rename_snp::Bool, silent::Bool, allow_monomorphic::Bool)
vcf(infile::String; ; rename_loci::Bool, silent::Bool, allow_monomorphic::Bool)
Load a VCF file into memory as a PopData object. Population information needs to be provided separately.
- `infile` : path to VCF file (can be gzipped)
Expand Down
6 changes: 3 additions & 3 deletions src/io/VariantCallLazy.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ function bcf(infile::String; rename_loci::Bool = false, silent::Bool = false, al
insertcols!(geno_df, Symbol(BCF.chrom(record) * "_" * string(BCF.pos(record))) => conv_geno)
end
close(stream)
if rename_snp
if rename_loci
rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)])
rename!(geno_df, rnm)
end
Expand Down Expand Up @@ -75,7 +75,7 @@ end

### VCF parsing ###

function vcf(infile::String; rename_snp::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false)
function vcf(infile::String; rename_loci::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false)
bases = (A = Int8(1), T = Int8(2), C = Int8(3), G = Int8(4), miss = Int8(0))
stream = VCF.Reader(openvcf(infile))
nmarkers = countlines(openvcf(infile)) - length(VCF.header(stream)) - 1
Expand All @@ -97,7 +97,7 @@ function vcf(infile::String; rename_snp::Bool = false, silent::Bool = false, all
insertcols!(geno_df, Symbol(VCF.chrom(record) * "_" * string(VCF.pos(record))) => conv_geno)
end
close(stream)
if rename_snp
if rename_loci
rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)])
rename!(geno_df, rnm)
end
Expand Down
6 changes: 3 additions & 3 deletions test/io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ using Test

cats_gen = normpath(joinpath(@__DIR__,"..","data/", "nancycats.gen"))
sharks_csv = normpath(joinpath(@__DIR__,"..","data/", "gulfsharks.csv"))
oyster_vcf = normpath(joinpath(@__DIR__,"..","data/", "filtered_oyster.vcf"))
example_vcf = normpath(joinpath(@__DIR__,"..","data/", "example.vcf"))

@testset "Genepop io" begin
@test typeof(read_from(cats_gen, silent = true)) == PopData
Expand All @@ -19,8 +19,8 @@ end
end

@testset "VCF io" begin
@test typeof(read_from(oyster_vcf, silent = true)) == PopData
@test typeof(vcf(oyster_vcf, silent = true)) == PopData
@test typeof(read_from(example_vcf, silent = true)) == PopData
@test typeof(vcf(example_vcf, silent = true)) == PopData
end

end # module
Expand Down

2 comments on commit c61fa81

@pdimens
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register branch=release

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/33220

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.6.0 -m "<description of version>" c61fa81431097d92c8470a1761a242f347271041
git push origin v0.6.0

Please sign in to comment.