Merge pull request #39 from pdimens/dev

stage 0.1.0 release
BioJulia · Nov 13, 2020 · a218bf4 · a218bf4
2 parents 0685d4f + 36e1dfc
commit a218bf4
Show file tree

Hide file tree

Showing 37 changed files with 3,778 additions and 519 deletions.
diff --git a/.github/workflows/ci_main.yml → .github/workflows/ci_dev.yaml b/.github/workflows/ci_main.yml → .github/workflows/ci_dev.yaml
@@ -1,9 +1,9 @@
-name: CI_main
+name: CI_dev
 on:
   pull_request:
-    branches: [main]
+    branches: [dev]
   push:
-    branches: [main]
+    branches: [dev]
 jobs:
   test:
     name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
@@ -12,7 +12,8 @@ jobs:
       matrix:
         version:
           - '1.4'
-          - 'nightly'
+          - '1.5'
+#          - 'nightly'
         os:
           - ubuntu-latest
           - macOS-latest

diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,8 @@
 .DS_Store
 /Manifest.toml
+/README.md
 /dev/
+/.github/
 # Dependencies
 /node_modules
 
@@ -20,3 +22,5 @@
 npm-debug.log*
 yarn-debug.log*
 yarn-error.log*
+
+jason_work/*
diff --git a/Project.toml b/Project.toml
@@ -1,22 +1,34 @@
 name = "PopGen"
 uuid = "af524d12-c74b-11e9-22a8-3b091653023f"
 authors = ["Pavel Dimens", "Jason Selwyn"]
-version = "0.0.1"
+version = "0.1.0"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
-GeneticVariation = "9bc6ac9d-e6b2-5f70-b0a8-242a01662520"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 MultipleTesting = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b"
+PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
+ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Requires = "ae029012-a4dd-5104-9daa-d747884805df"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
+CSV = "0.7"
+DataFrames = "0.21"
+Distributions = "0.23, 0.24"
 FileIO = "1.3"
+JLD2 = "0.1, 0.2"
+MultipleTesting = "0.4"
+PooledArrays = "0.5"
+ProgressMeter = "1.4"
+Requires = "1.1"
+StaticArrays = "0.12"
+StatsBase = "0.33"
 julia = "1.3.1"
 
 [extras]

diff --git a/README.md b/README.md
@@ -4,10 +4,10 @@ Population Genetics in Julia. We hope to merge this package into the `BioJulia`
 
 [![alt text](https://img.shields.io/badge/docs-stable-informational?style=for-the-badge&logo=Read%20The%20Docs&logoColor=white)](https://pdimens.github.io/PopGen.jl/) 
 [![alt text](https://img.shields.io/badge/slack-join%20PopGen.jl-9d72b1?style=for-the-badge&logo=slack)](https://join.slack.com/t/popgenjl/shared_invite/zt-deam65n8-DuBs2z1oDtsbBuRplJW~Pg)
-![build_status](https://img.shields.io/github/workflow/status/pdimens/PopGen.jl/CI_main?label=Build%20Status&logo=GitHub&style=for-the-badge)
+![build_status](https://img.shields.io/github/workflow/status/pdimens/PopGen.jl/CI_dev?label=Dev%20Build&logo=GitHub&style=for-the-badge)
 
 #### How to install:
-Invoke the package manager by pressing `]` on an empty line and `add` this repo
+Invoke the package manager by pressing `]` on an empty line and `add PopGen`
 
 ![install_instructions](misc/install_carbon.png)
 

diff --git a/data/datasets.jld2 b/data/datasets.jld2
diff --git a/misc/install_carbon.png b/misc/install_carbon.png
diff --git a/src/AlleleFreq.jl b/src/AlleleFreq.jl
@@ -1,22 +1,37 @@
+#TODO add to API docs
 """
-    allele_freq(locus::T) where T<:GenoArray
+allele_freq(data::PopData)
+Return a NamedTuple of `Dicts` of allele frequencies of all
+loci in a `PopData` object. These are global allele frequencies.
+"""
+function allele_freq(data::PopData)
+    NamedTuple{Tuple(Symbol.(loci(data)))}(
+            DataFrames.combine(
+            groupby(data.loci, :locus),
+            :genotype => allele_freq => :frq
+        )[:, :frq] |> Tuple
+    )
+end
+
+"""
+    allele_freq(locus::GenoArray)
 Return a `Dict` of allele frequencies of a single locus in a `PopData`
 object.
 """
 @inline function allele_freq(locus::GenoArray)
     d = Dict{Int16,Float32}()
+    all(ismissing.(locus)) == true && return d
     flat_alleles = alleles(locus)
     len = length(flat_alleles)
-    len == 0 && return d
     @inbounds @simd for allele in flat_alleles
-        @inbounds d[allele] = get!(d, allele, 0) + 1/len
+        d[allele] = @inbounds get!(d, allele, 0.0) + 1.0/len
     end
     return d
 end
 
 
 """
-    allele_freq_vec(locus::T) where T<:GenoArray
+    allele_freq_vec(locus::GenoArray)
 Return a Vector of allele frequencies of a single locus in a `PopData`
 object. Similar to `allele_freq()`, except it returns only the frequencies,
 without the allele names, meaning they can be in any order. This is useful
@@ -34,6 +49,7 @@ end
 end
 
 #TODO add to docs (API/AlleleFreq)
+#BUG
 """
     avg_allele_freq(allele_dicts::AbstractVector{T}) where T<:Dict{Int16,Float32}
 Takes a Vector of Dicts generated by `allele_freq` and returns a Dict of the average
@@ -58,11 +74,12 @@ function avg_allele_freq(allele_dicts::AbstractVector{T}) where T<:Dict{Int16,Fl
    # remove any dicts with no entries (i.e. from a group without that locus)
    allele_dicts = allele_dicts[length.(allele_dicts) .> 0]
    # create a list of all the alleles
+   #TODO replace with union?
    all_alleles = keys.(allele_dicts) |> Base.Iterators.flatten |> collect |> unique
    # populate the sum dict with allele frequency and n for each allele
    @inbounds for allele in all_alleles
        for allele_dict in allele_dicts
-           sum_dict[allele] = get!(sum_dict, allele, (0., 0)) .+ (get!(allele_dict, allele, 0.), 1)
+           @inbounds sum_dict[allele] = get!(sum_dict, allele, (0., 0)) .+ (get!(allele_dict, allele, 0.), 1)
        end
    end
    avg_dict = Dict{Int16, Float32}()
@@ -102,7 +119,7 @@ end
 
 #TODO add to docs (API)
 """
-    allele_freq(allele::Int, genos::T) where T<:GenoArray
+    allele_freq(allele::Int, genos::GenoArray)
 Return the frequency of an `allele` from a vector of `genotypes`
 
 ### Example
@@ -120,7 +137,7 @@ function allele_freq(allele::Int, genos::T) where T<:GenoArray
 end
 
 """
-    geno_count_observed(locus::T) where T<:GenoArray
+    geno_count_observed(locus::GenoArray)
 Return a `Dict` of genotype counts of a single locus in a
 `PopData` object.
 """
@@ -130,13 +147,13 @@ Return a `Dict` of genotype counts of a single locus in a
     d = Dict{Tuple, Float32}()
     @inbounds for genotype in skipmissing(locus)
         # sum up non-missing genotypes
-        d[genotype] = get!(d, genotype, 0) + 1
+        d[genotype] = get!(d, genotype, 0.0) + 1.0 
     end
     return d
 end
 
 """
-    geno_count_expected(locus::T) where T<:GenoArray
+    geno_count_expected(locus::GenoArray)
 Return a `Dict` of the expected genotype counts of a single locus in a
 `PopData` object. Expected counts are calculated as the product of observed
 allele frequencies multiplied by the number of non-missing genotypes.
@@ -167,7 +184,7 @@ end
 
 
 """
-    geno_freq(locus::T) where T<:GenoArray
+    geno_freq(locus::GenoArray)
 Return a `Dict` of genotype frequencies of a single locus in a
 `PopData` object.
 """
@@ -177,7 +194,7 @@ Return a `Dict` of genotype frequencies of a single locus in a
     d = Dict{Tuple, Float32}()
     @inbounds for genotype in skipmissing(locus)
         # sum up non-missing genotypes
-        d[genotype] = get!(d, genotype, 0) + 1
+        d[genotype] = get!(d, genotype, 0.0) + 1.0
     end
     total = values(d) |> sum    # sum of all non-missing genotypes
     [d[i] = d[i] / total for i in keys(d)] # genotype count/total
@@ -197,7 +214,7 @@ geno_freq(cats, "fca8")
 geno_freq(cats, "fca8", population = true)
 ```
 """
-function geno_freq(data::PopData, locus::String, population::Bool=false)
+function geno_freq(data::PopData, locus::String; population::Bool=false)
     if !population
         tmp = data.loci[data.loci.locus .== locus, :genotype]
         geno_freq(tmp)
@@ -207,9 +224,8 @@ function geno_freq(data::PopData, locus::String, population::Bool=false)
     end
 end
 
-#TODO replace allele creation with Base.Iterators.product
 """
-    geno_freq_expected(locus::T) where T<:GenoArray
+    geno_freq_expected(locus::GenoArray)
 Return a `Dict` of the expected genotype frequencies of a single locus in a
 `PopData` object. Expected frequencies are calculated as the product of
 observed allele frequencies.
@@ -231,7 +247,7 @@ function geno_freq_expected(locus::T) where T<:GenoArray
     # reform genotype frequencies into a Dict
     expected = Dict{Tuple, Float64}()
     for (geno, freq) in zip(genos, expected_genotype_freq)
-        expected[geno] = get!(expected, geno, 0) + freq
+        expected[geno] = get!(expected, geno, 0.0) + freq
     end
 
     return expected
@@ -249,7 +265,7 @@ geno_freq_expeced(cats, "fca8")
 geno_freq_expected(cats, "fca8", population = true)
 ```
 """
-function geno_freq_expected(data::PopData, locus::String, population::Bool=false)
+function geno_freq_expected(data::PopData, locus::String; population::Bool=false)
     if !population
         tmp = data.loci[data.loci.locus .== locus, :genotype]
         geno_freq_expected(tmp)

diff --git a/src/DataExploration.jl b/src/DataExploration.jl
@@ -46,27 +46,48 @@ missing(gulfsharks(), by = "pop")
     end
 end
 
-#TODO add to docs (Data Exploration page and API)
+
+"""
+    pairwise_identical(data::PopData)
+Return a table of the percent of identical genotypes at each locus between pairs of individuals.
+"""
 function pairwise_identical(data::PopData)
-    sample_names = samples(data)
-    samples_2 = @view sample_names[2:end-1]
-    sample_pairs = collect(Iterators.product(samples_2, sample_names)) |> vec
+    sample_names = collect(samples(data))
+    pairwise_identical(data, sample_names)
+end
+
+"""
+    pairwise_identical(data::PopData, sample_names::Vector{String})
+Return a table of the percent of identical genotypes at each locus
+between all pairs of provided individuals.
+"""
+function pairwise_identical(data::PopData, sample_names::Vector{String})
+    errs = ""
+    all_samples = samples(data)
+    if sample_names != all_samples
+        [errs *= "\n  $i" for i in sample_names if i ∉ all_samples]
+        errs != "" && error("Samples not found in the PopData: " * errs)
+    end
+    sample_pairs = pairwise_pairs(sample_names)
     n = length(sample_pairs)
     perc_ident_vec = Vector{Float64}(undef, n)
     n_vec = Vector{Int}(undef, n)
+    popdata_idx = groupby(data.loci, :name)
     idx = 0
-    @inbounds for sample_1 in sample_names
-        geno_1 = get_sample_genotypes(data, sample_1)
-        len_1 = length(collect(skipmissing(geno_1)))
-        Base.Threads.@threads for sample_2 in sample_names[2:end-1]
-            idx += 1
-            geno_2 = get_sample_genotypes(data, sample_2)
-            len_2 = length(collect(skipmissing(geno_2)))
+    p = Progress(length(sample_pairs), dt = 1, color = :blue)
+    @inbounds @sync for i in 1:length(sample_pairs)
+        Base.Threads.@spawn begin
+            @inbounds geno_1 = popdata_idx[(sample_pairs[i][1],)].genotype
+            @inbounds geno_2 = popdata_idx[(sample_pairs[i][2],)].genotype
+            len_1 = nonmissing(geno_1)
+            len_2 = nonmissing(geno_2)
             shared_geno = minimum([len_1, len_2])
-            shared = sum(skipmissing(geno_1 .== geno_2))
-            perc_ident_vec[idx] = round(shared/shared_geno, digits = 2)
-            n_vec[idx] = shared_geno
+                shared_geno = minimum([len_1, len_2])
+                shared = sum(skipmissing(geno_1 .== geno_2))
+                @inbounds perc_ident_vec[i] = round(shared/shared_geno, digits = 2)
+                @inbounds n_vec[i] = shared_geno
+            next!(p)
         end
     end
-    DataFrame(:sample_1 => getindex.(sample_pairs, 2), :sample_2 => getindex.(sample_pairs, 1), :identical => perc_ident_vec, :n => n_vec)
-end
+    DataFrame(:sample_1 => map(i -> i[1], sample_pairs), :sample_2 => map(i -> i[2], sample_pairs), :identical => perc_ident_vec, :n => n_vec)
+end
diff --git a/src/FStats.jl b/src/FStats.jl
@@ -31,10 +31,10 @@
     )
     =#
 
-    HT = mean(@inbounds @avx 1.0 .- n_df.avg_freq .+ (n_df.HS ./ n_df.mn ./ n_df.count) - (n_df.Het_obs ./ 2.0 ./ n_df.mn ./ n_df.count))
-    DST = mean(@inbounds @avx HT .- n_df.HS)
-    DST′ = mean(@inbounds @avx n_df.count ./ (n_df.count .- 1) .* DST)
-    HT′ = mean(@inbounds @avx n_df.HS .+ DST′)
+    HT = mean(@inbounds 1.0 .- n_df.avg_freq .+ (n_df.HS ./ n_df.mn ./ n_df.count) - (n_df.Het_obs ./ 2.0 ./ n_df.mn ./ n_df.count))
+    DST = mean(@inbounds HT .- n_df.HS)
+    DST′ = mean(@inbounds n_df.count ./ (n_df.count .- 1) .* DST)
+    HT′ = mean(@inbounds n_df.HS .+ DST′)
 
     return Dict{Symbol, Float64}(
         :FST => DST / HT,
@@ -96,10 +96,10 @@ end
         groupby(het_df, :locus)
     )
 
-    HT = mean(@inbounds @avx 1.0 .- n_df.avg_freq .+ (n_df.HS ./ n_df.mn ./ n_df.count) - (n_df.Het_obs ./ 2.0 ./ n_df.mn ./ n_df.count))
-    DST = mean(@inbounds @avx HT .- n_df.HS)
-    DST′ = mean(@inbounds @avx n_df.count ./ (n_df.count .- 1) .* DST)
-    HT′ = mean(@inbounds @avx n_df.HS .+ DST′)
+    HT = mean(@inbounds 1.0 .- n_df.avg_freq + (n_df.HS / n_df.mn / n_df.count) - (n_df.Het_obs / 2.0 / n_df.mn / n_df.count))
+    DST = mean(@inbounds HT - n_df.HS)
+    DST′ = mean(@inbounds n_df.count / (n_df.count .- 1) * DST)
+    HT′ = mean(@inbounds n_df.HS .+ DST′)
 
     return Dict{Symbol, Float64}(
         :FST => DST / HT,
@@ -108,21 +108,22 @@ end
 end
 
 
-
-
 function f_stat_p(data::PopData; nperm::Int = 1000)
     observed = FST_global(data)
     popnames = data.meta.population    
     perm_dict = Dict{Symbol,Vector{Float64}}(
         :FST => Vector{Float64}(undef, nperm-1),
         :FST′ => Vector{Float64}(undef, nperm-1)
         )
-
-    @sync Base.Threads.@spawn for n in 1:nperm-1
-        tmp = copy(data.loci)
-        tmp_f = FST_global(permute_samples!(tmp, popnames))
-        perm_dict[:FST][n] = tmp_f[:FST]
-        perm_dict[:FST′][n] = tmp_f[:FST′]
+    p = Progress(length(sample_pairs), dt = 1, color = :blue)
+    @inbounds @sync for n in 1:nperm-1
+        Base.Threads.@spawn begin
+            tmp = copy(data.loci)
+            tmp_f = FST_global(permute_samples!(tmp, popnames))
+            perm_dict[:FST][n] = tmp_f[:FST]
+            perm_dict[:FST′][n] = tmp_f[:FST′]
+            next!(p)
+        end
     end
 
     @inbounds for (k,v) in perm_dict