Merge pull request #64 from BioJulia/dev

v0.6.0
BioJulia · Mar 31, 2021 · c61fa81 · c61fa81 · pdimens · Mar 31, 2021
2 parents 3a54b77 + 1338b9b
commit c61fa81
Show file tree

Hide file tree

Showing 10 changed files with 230 additions and 809 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "PopGen"
 uuid = "af524d12-c74b-11e9-22a8-3b091653023f"
 authors = ["Pavel Dimens", "Jason Selwyn"]
-version = "0.5.2"
+version = "0.6.0"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"

diff --git a/data/example.vcf b/data/example.vcf
diff --git a/data/filtered_oyster.vcf b/data/filtered_oyster.vcf
diff --git a/src/FStatistics/FstMethods.jl b/src/FStatistics/FstMethods.jl
@@ -1,3 +1,48 @@
+function pairwise_Hudson(data::PopData)
+    !isbiallelic(data) && throw(error("Data must be biallelic to use the Hudson estimator"))
+    idx_pdata = groupby(data.loci, :population)
+    pops = getindex.(keys(idx_pdata), :population)
+    npops = length(idx_pdata)
+    n_loci = size(data)[2]
+    results = zeros(npops, npops)
+    @sync for i in 2:npops
+        Base.Threads.@spawn begin
+            for j in 1:(i-1)
+                pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
+                pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
+                results[i,j] = fst_hudson(pop1,pop2)
+           end
+        end
+    end
+    return PairwiseFST(DataFrame(results, Symbol.(pops)), "Hudson et al. 1992")
+end
+
+function hudson_fst(population_1::T, population_2::T) where T<:AbstractMatrix
+    fst_perloc = @views [_hudson_fst(population_1[:,i], population_2[:,i]) for i in 1:size(population_1,2)]
+    return mean(skipmissing(skipinfnan(fst_perloc)))
+end
+
+# helper function to do the math for Hudson FST on a locus
+function _hudson_fst(pop1::T, pop2::T) where T<:GenoArray
+    p1_frq = allele_freq(pop1)
+    p2_frq = allele_freq(pop2)
+    # find the shared allele(s) and choose one of them to be "P"
+    # this is a safeguard if one population is completely homozygous for an allele
+    p_allele = intersect(keys(p1_frq), keys(p2_frq)) |> first
+    p1 = p1_frq[p_allele]
+    q1 = 1.0 - p1
+    p2 = p2_frq[p_allele]
+    q2 = 1.0 - p2 
+    # degrees of freedom is the number of alleles - 1
+    df1 = (count(!ismissing, pop1) * 2) - 1
+    df2 = (count(!ismissing, pop2) * 2) - 1
+    numerator = (p1 - p2)^2 - (p1*q1/df1) - (p2*q2/df2)
+    denominator = (p1*q2) + (p2*q1)
+    return numerator/denominator
+end
+
+
+
 ## Nei 1987 FST ##
 function nei_fst(population_1::T, population_2::T) where T<:AbstractMatrix
     n =  hcat(map(nonmissing, eachcol(population_1)), map(nonmissing, eachcol(population_2)))

diff --git a/src/FStatistics/FstPermutations.jl b/src/FStatistics/FstPermutations.jl
@@ -1,6 +1,29 @@
 #TODO add to docs/API
+"""
+    _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
+Returns two matrices with rows (samples) shuffled between them. Respects the
+number of rows of the original matrices (i.e. population sizes).
+"""
+function _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
+    merged = vcat(population_1, population_2)
+    # get n for each population
+    p1_size = size(population_1, 1)
+    p2_size = size(population_2, 1)
+    # total number of samples
+    n_samples = p1_size + p2_size
+    # generate permutation indices for the first "population"
+    perm1 = sample(Xoroshiro128Star(), 1:n_samples, p1_size, replace = false)
+    # generate permutation indices for the second "population"
+    perm2 = @inbounds (1:n_samples)[Not(perm1)]
+    # index the merged matrix with the permuted indices
+    new_pop_1 = @inbounds @view merged[perm1,:]
+    new_pop_2 = @inbounds @view merged[perm2,:]
+    return new_pop_1, new_pop_2 
+end
 
-function _permuted_WeirCockerham(data::PopData, iterations::Int64)
+
+function _permuted_Hudson(data::PopData, iterations::Int64)
+    !isbiallelic(data) && throw(error("Data must be biallelic to use the Hudson estimator"))
     idx_pdata = groupby(data.loci, :population)
     pops = getindex.(keys(idx_pdata), :population)
     npops = length(idx_pdata)
@@ -12,11 +35,11 @@ function _permuted_WeirCockerham(data::PopData, iterations::Int64)
         @inbounds for j in 1:(i-1)
             pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
             pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
-            fst_val = weircockerham_fst(pop1,pop2)
+            fst_val = hudson_fst(pop1,pop2)
             @inbounds @sync for iter in 1:iterations-1
                 Base.Threads.@spawn begin
                     perm_p1, perm_p2 = _fst_permutation(pop1, pop2)
-                    perm_vector[iter] = weircockerham_fst(perm_p1, perm_p2)
+                    perm_vector[iter] = hudson_fst(perm_p1, perm_p2)
                     pops_text = string(pops[i]) * " & " * string(pops[j])
                     ProgressMeter.next!(p; showvalues = [(:Populations, pops_text), (:Iteration, "$iter")])
                 end
@@ -26,9 +49,10 @@ function _permuted_WeirCockerham(data::PopData, iterations::Int64)
         end
     end
     println("Below diagonal: FST values | Above diagonal: P values")
-    return PairwiseFST(DataFrame(results, Symbol.(pops)), "Weir & Cockerham (with p-values)")
+    return PairwiseFST(DataFrame(results, Symbol.(pops)), "Hudson et al. 1992 (with p-values)")
 end
 
+
 function _permuted_Nei(data::PopData, iterations::Int64)
     idx_pdata = groupby(data.loci, :population)
     pops = getindex.(keys(idx_pdata), :population)
@@ -59,24 +83,31 @@ function _permuted_Nei(data::PopData, iterations::Int64)
 end
 
 
-"""
-    _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
-Returns two matrices with rows (samples) shuffled between them. Respects the
-number of rows of the original matrices (i.e. population sizes).
-"""
-function _fst_permutation(population_1::T, population_2::T) where T<:AbstractMatrix
-    merged = vcat(population_1, population_2)
-    # get n for each population
-    p1_size = size(population_1, 1)
-    p2_size = size(population_2, 1)
-    # total number of samples
-    n_samples = p1_size + p2_size
-    # generate permutation indices for the first "population"
-    perm1 = sample(Xoroshiro128Star(), 1:n_samples, p1_size, replace = false)
-    # generate permutation indices for the second "population"
-    perm2 = @inbounds (1:n_samples)[Not(perm1)]
-    # index the merged matrix with the permuted indices
-    new_pop_1 = @inbounds @view merged[perm1,:]
-    new_pop_2 = @inbounds @view merged[perm2,:]
-    return new_pop_1, new_pop_2 
+function _permuted_WeirCockerham(data::PopData, iterations::Int64)
+    idx_pdata = groupby(data.loci, :population)
+    pops = getindex.(keys(idx_pdata), :population)
+    npops = length(idx_pdata)
+    n_loci = size(data)[2]
+    results = zeros(npops, npops)
+    perm_vector = zeros(iterations-1)
+    p = Progress(sum(1:(npops - 1)) * (iterations - 1), dt = 1, color = :blue, barglyphs = BarGlyphs("|=> |"), barlen = 30)
+    @inbounds for i in 2:npops
+        @inbounds for j in 1:(i-1)
+            pop1 = reshape(idx_pdata[i].genotype, :, n_loci)
+            pop2 = reshape(idx_pdata[j].genotype, :, n_loci)
+            fst_val = weircockerham_fst(pop1,pop2)
+            @inbounds @sync for iter in 1:iterations-1
+                Base.Threads.@spawn begin
+                    perm_p1, perm_p2 = _fst_permutation(pop1, pop2)
+                    perm_vector[iter] = weircockerham_fst(perm_p1, perm_p2)
+                    pops_text = string(pops[i]) * " & " * string(pops[j])
+                    ProgressMeter.next!(p; showvalues = [(:Populations, pops_text), (:Iteration, "$iter")])
+                end
+            end
+            @inbounds results[i,j] = fst_val
+            @inbounds results[j,i] = (sum(fst_val .<= perm_vector) + 1) / iterations
+        end
+    end
+    println("Below diagonal: FST values | Above diagonal: P values")
+    return PairwiseFST(DataFrame(results, Symbol.(pops)), "Weir & Cockerham (with p-values)")
 end
diff --git a/src/FStatistics/PairwiseFST.jl b/src/FStatistics/PairwiseFST.jl
@@ -27,8 +27,9 @@ Calculate pairwise FST between populations in a `PopData` object. Set `iteration
 to a value greater than `0` to perform a single-tailed permutation test to obtain
 P-values of statistical significance.
 #### Methods:
+- `"Hudson92"`: Hudson et al. (1992) method (only for biallelic data)
 - `"Nei87"`: Nei (1987) method
-- `"WC84"` : Weir-Cockerham (1984) method (default)
+- `"WC84"` : Weir & Cockerham (1984) method (default)
 
 **Examples**
 ```julia
@@ -44,6 +45,9 @@ function pairwise_fst(data::PopData; method::String = "WC84", iterations::Int64
     elseif occursin("wc", lowercase(method))
         iterations > 0 && return _permuted_WeirCockerham(data, iterations)
         iterations == 0 && return _pairwise_WeirCockerham(data)
+    elseif occursin("hudson", lowercase(method))
+        iterations > 0 && return _permuted_Hudson(data, iterations)
+        iterations == 0 && return _pairwise_Hudson(data)
     else
         throw(ArgumentError("please use one of \"WC84\" or \"Nei87\" methods"))
     end

diff --git a/src/Utils.jl b/src/Utils.jl
@@ -1,4 +1,4 @@
-export size, drop_monomorphic, drop_monomorphic!
+export size, drop_monomorphic, drop_monomorphic!, drop_multiallelic, drop_multiallelic!
 
 ## experimental and not exported or documented!
 function adjacency_matrix(data::PopData)
@@ -128,19 +128,18 @@ Return a `PopData` object omitting any monomorphic loci. Will inform you which
 loci were removed.
 """
 function drop_monomorphic(data::PopData)
-    rm_loci = Vector{String}()
-    for (loc, loc_sdf) in pairs(groupby(data.loci, :locus))
-        length(unique(skipmissing(loc_sdf[:, :genotype]))) == 1 && push!(rm_loci, loc.locus)
-    end
-
-    if length(rm_loci) == 0
+    all_loci = loci(data)
+    mtx = reshape(data.loci.genotype, length(samples(data)), :)
+    monomorphs = [length(unique(skipmissing(x))) == 1 for x in eachcol(mtx)]
+    loci_to_rm = all_loci[monomorphs]
+    if length(loci_to_rm) == 0
         return data
-    elseif length(rm_loci) == 1
-        @info "Dropping monomorphic locus " * rm_loci[1]
+    elseif length(loci_to_rm) == 1
+        @info "Removing monomorphic locus " * loci_to_rm[1]
     else
-        @info "Dropping $(length(rm_loci)) monomorphic loci" * "\n $rm_loci"
+        @info "Removing $(length(loci_to_rm)) monomorphic loci:" * "\n $loci_to_rm"
     end
-    exclude(data, locus = rm_loci)
+    exclude(data, locus = loci_to_rm)
 end
 
 
@@ -150,20 +149,81 @@ Edit a `PopData` object in place by omitting any monomorphic loci. Will inform y
 loci were removed.
 """
 function drop_monomorphic!(data::PopData)
-    rm_loci = Vector{String}()
-    for (loc, loc_sdf) in pairs(groupby(data.loci, :locus))
-        length(unique(skipmissing(loc_sdf[:, :genotype]))) == 1 && push!(rm_loci, loc.locus)
+    all_loci = loci(data)
+    mtx = reshape(data.loci.genotype, length(samples(data)), :)
+    monomorphs = [length(unique(skipmissing(x))) == 1 for x in eachcol(mtx)]
+    loci_to_rm = all_loci[monomorphs]
+    if length(loci_to_rm) == 0
+        return data
+    elseif length(loci_to_rm) == 1
+        @info "Removing monomorphic locus " * loci_to_rm[1]
+    else
+        @info "Removing $(length(loci_to_rm)) monomorphic loci:" * "\n $loci_to_rm"
+    end
+    exclude!(data, locus = loci_to_rm)
+end
+
+
+#TODO add to docs
+"""
+    drop_multiallelic(data::PopData)
+Return a `PopData` object omitting loci that are not biallelic.
+"""
+function drop_multiallelic(data::PopData)
+    all_loci = loci(data)
+    mtx = reshape(data.loci.genotype, length(samples(data)), :)
+    nonbi = [!isbiallelic(x) for x in eachcol(mtx)]
+    loci_to_rm = all_loci[nonbi]
+    if length(loci_to_rm) == 0
+        return data
+    elseif length(loci_to_rm) == 1
+        @info "Removing 1 multiallelic locus"
+    else
+        @info "Removing $(length(loci_to_rm)) multialleic loci"
     end
-    if length(rm_loci) == 0
+    exclude(data, locus = loci_to_rm)
+end
+
+
+"""
+    drop_multiallelic!(data::PopData)
+Edit a `PopData` object in place, removing loci that are not biallelic.
+"""
+function drop_multiallelic!(data::PopData)
+    all_loci = loci(data)
+    mtx = reshape(data.loci.genotype, length(samples(data)), :)
+    nonbi = [!isbiallelic(x) for x in eachcol(mtx)]
+    loci_to_rm = all_loci[nonbi]
+    if length(loci_to_rm) == 0
         return data
-    elseif length(rm_loci) == 1
-        @info "Dropping monomorphic locus " * rm_loci[1]
+    elseif length(loci_to_rm) == 1
+        @info "Removing 1 multiallelic locus"
     else
-        @info "Dropping $(length(rm_loci)) monomorphic loci" * "\n $rm_loci"
+        @info "Removing $(length(loci_to_rm)) multialleic loci"
     end
-    exclude!(data, locus = rm_loci)
+    exclude!(data, locus = loci_to_rm)
+end
+
+
+"""
+    isbiallelic(data::GenoArray)
+Returns `true` if the `GenoArray` is biallelic, `false` if not.
+"""
+function isbiallelic(data::T) where T<:GenoArray
+    length(unique(Base.Iterators.flatten(skipmissing(data)))) == 2
 end
 
+
+"""
+    isbiallelic(data::PopData)
+Returns `true` all the loci in the `PopData` are biallelic, `false` if not.
+"""
+function isbiallelic(data::PopData)
+    mtx = reshape(data.loci.genotype, length(samples(data)), :)
+    all(map(isbiallelic, eachcol(mtx)))
+end
+
+
 """
     loci_dataframe(data::PopData)
 Return a wide `DataFrame` of samples as columns, ommitting population information.
@@ -563,8 +623,8 @@ julia> cats_meta = generate_meta(cats_nometa)
 function generate_meta(data::DataFrame)
     grp = groupby(data, :name)
     nms = map(z -> z.name, keys(grp))
-    pops = map(z -> first(z.population), grp)
-    ploids = map(z -> find_ploidy(z.genotype), grp)
+    pops = [first(z.population) for z in grp]
+    ploids = [find_ploidy(z.genotype) for z in grp]
     DataFrame(
         :name => nms,
         :population => pops,

diff --git a/src/io/VariantCall.jl b/src/io/VariantCall.jl
@@ -1,7 +1,7 @@
 export bcf, vcf
 
 """
-    bcf(infile::String; ; rename_snp::Bool, silent::Bool, allow_monomorphic::Bool)
+    bcf(infile::String; ; rename_loci::Bool, silent::Bool, allow_monomorphic::Bool)
 Load a BCF file into memory as a PopData object. Population information needs to be provided separately. 
 - `infile` : path to BCF file (can be gzipped)
 
@@ -33,7 +33,7 @@ end
 ### VCF parsing ###
 
 """
-    vcf(infile::String; ; rename_snp::Bool, silent::Bool, allow_monomorphic::Bool)
+    vcf(infile::String; ; rename_loci::Bool, silent::Bool, allow_monomorphic::Bool)
 Load a VCF file into memory as a PopData object. Population information needs to be provided separately. 
 - `infile` : path to VCF file (can be gzipped)
 

diff --git a/src/io/VariantCallLazy.jl b/src/io/VariantCallLazy.jl
@@ -37,7 +37,7 @@ function bcf(infile::String; rename_loci::Bool = false, silent::Bool = false, al
         insertcols!(geno_df, Symbol(BCF.chrom(record) * "_" * string(BCF.pos(record))) => conv_geno)
     end
     close(stream)
-    if rename_snp
+    if rename_loci
         rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)])
         rename!(geno_df, rnm)
     end
@@ -75,7 +75,7 @@ end
 
 ### VCF parsing ###
 
-function vcf(infile::String; rename_snp::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false)
+function vcf(infile::String; rename_loci::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false)
     bases = (A = Int8(1), T = Int8(2), C = Int8(3), G = Int8(4), miss = Int8(0))
     stream = VCF.Reader(openvcf(infile))
     nmarkers = countlines(openvcf(infile)) - length(VCF.header(stream)) - 1
@@ -97,7 +97,7 @@ function vcf(infile::String; rename_snp::Bool = false, silent::Bool = false, all
         insertcols!(geno_df, Symbol(VCF.chrom(record) * "_" * string(VCF.pos(record))) => conv_geno)
     end
     close(stream)
-    if rename_snp
+    if rename_loci
         rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)])
         rename!(geno_df, rnm)
     end

diff --git a/test/io.jl b/test/io.jl
@@ -6,7 +6,7 @@ using Test
 
 cats_gen = normpath(joinpath(@__DIR__,"..","data/", "nancycats.gen"))
 sharks_csv = normpath(joinpath(@__DIR__,"..","data/", "gulfsharks.csv"))
-oyster_vcf = normpath(joinpath(@__DIR__,"..","data/", "filtered_oyster.vcf"))
+example_vcf = normpath(joinpath(@__DIR__,"..","data/", "example.vcf"))
 
 @testset "Genepop io" begin
     @test typeof(read_from(cats_gen, silent = true)) == PopData
@@ -19,8 +19,8 @@ end
 end
 
 @testset "VCF io" begin
-    @test typeof(read_from(oyster_vcf, silent = true)) == PopData
-    @test typeof(vcf(oyster_vcf, silent = true)) == PopData    
+    @test typeof(read_from(example_vcf, silent = true)) == PopData
+    @test typeof(vcf(example_vcf, silent = true)) == PopData    
 end
 
 end # module