Skip to content

Commit

Permalink
Merge pull request #39 from pdimens/dev
Browse files Browse the repository at this point in the history
stage 0.1.0 release
  • Loading branch information
pdimens authored Nov 13, 2020
2 parents 0685d4f + 36e1dfc commit a218bf4
Show file tree
Hide file tree
Showing 37 changed files with 3,778 additions and 519 deletions.
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
name: CI_main
name: CI_dev
on:
pull_request:
branches: [main]
branches: [dev]
push:
branches: [main]
branches: [dev]
jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }}
Expand All @@ -12,7 +12,8 @@ jobs:
matrix:
version:
- '1.4'
- 'nightly'
- '1.5'
# - 'nightly'
os:
- ubuntu-latest
- macOS-latest
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
.DS_Store
/Manifest.toml
/README.md
/dev/
/.github/
# Dependencies
/node_modules

Expand All @@ -20,3 +22,5 @@
npm-debug.log*
yarn-debug.log*
yarn-error.log*

jason_work/*
18 changes: 15 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,22 +1,34 @@
name = "PopGen"
uuid = "af524d12-c74b-11e9-22a8-3b091653023f"
authors = ["Pavel Dimens", "Jason Selwyn"]
version = "0.0.1"
version = "0.1.0"

[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
GeneticVariation = "9bc6ac9d-e6b2-5f70-b0a8-242a01662520"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
MultipleTesting = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"

[compat]
CSV = "0.7"
DataFrames = "0.21"
Distributions = "0.23, 0.24"
FileIO = "1.3"
JLD2 = "0.1, 0.2"
MultipleTesting = "0.4"
PooledArrays = "0.5"
ProgressMeter = "1.4"
Requires = "1.1"
StaticArrays = "0.12"
StatsBase = "0.33"
julia = "1.3.1"

[extras]
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ Population Genetics in Julia. We hope to merge this package into the `BioJulia`

[![alt text](https://img.shields.io/badge/docs-stable-informational?style=for-the-badge&logo=Read%20The%20Docs&logoColor=white)](https://pdimens.github.io/PopGen.jl/)
[![alt text](https://img.shields.io/badge/slack-join%20PopGen.jl-9d72b1?style=for-the-badge&logo=slack)](https://join.slack.com/t/popgenjl/shared_invite/zt-deam65n8-DuBs2z1oDtsbBuRplJW~Pg)
![build_status](https://img.shields.io/github/workflow/status/pdimens/PopGen.jl/CI_main?label=Build%20Status&logo=GitHub&style=for-the-badge)
![build_status](https://img.shields.io/github/workflow/status/pdimens/PopGen.jl/CI_dev?label=Dev%20Build&logo=GitHub&style=for-the-badge)

#### How to install:
Invoke the package manager by pressing `]` on an empty line and `add` this repo
Invoke the package manager by pressing `]` on an empty line and `add PopGen`

![install_instructions](misc/install_carbon.png)

Expand Down
Binary file modified data/datasets.jld2
Binary file not shown.
Binary file modified misc/install_carbon.png
100755 → 100644
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
48 changes: 32 additions & 16 deletions src/AlleleFreq.jl
Original file line number Diff line number Diff line change
@@ -1,22 +1,37 @@
#TODO add to API docs
"""
allele_freq(locus::T) where T<:GenoArray
allele_freq(data::PopData)
Return a NamedTuple of `Dicts` of allele frequencies of all
loci in a `PopData` object. These are global allele frequencies.
"""
function allele_freq(data::PopData)
NamedTuple{Tuple(Symbol.(loci(data)))}(
DataFrames.combine(
groupby(data.loci, :locus),
:genotype => allele_freq => :frq
)[:, :frq] |> Tuple
)
end

"""
allele_freq(locus::GenoArray)
Return a `Dict` of allele frequencies of a single locus in a `PopData`
object.
"""
@inline function allele_freq(locus::GenoArray)
d = Dict{Int16,Float32}()
all(ismissing.(locus)) == true && return d
flat_alleles = alleles(locus)
len = length(flat_alleles)
len == 0 && return d
@inbounds @simd for allele in flat_alleles
@inbounds d[allele] = get!(d, allele, 0) + 1/len
d[allele] = @inbounds get!(d, allele, 0.0) + 1.0/len
end
return d
end


"""
allele_freq_vec(locus::T) where T<:GenoArray
allele_freq_vec(locus::GenoArray)
Return a Vector of allele frequencies of a single locus in a `PopData`
object. Similar to `allele_freq()`, except it returns only the frequencies,
without the allele names, meaning they can be in any order. This is useful
Expand All @@ -34,6 +49,7 @@ end
end

#TODO add to docs (API/AlleleFreq)
#BUG
"""
avg_allele_freq(allele_dicts::AbstractVector{T}) where T<:Dict{Int16,Float32}
Takes a Vector of Dicts generated by `allele_freq` and returns a Dict of the average
Expand All @@ -58,11 +74,12 @@ function avg_allele_freq(allele_dicts::AbstractVector{T}) where T<:Dict{Int16,Fl
# remove any dicts with no entries (i.e. from a group without that locus)
allele_dicts = allele_dicts[length.(allele_dicts) .> 0]
# create a list of all the alleles
#TODO replace with union?
all_alleles = keys.(allele_dicts) |> Base.Iterators.flatten |> collect |> unique
# populate the sum dict with allele frequency and n for each allele
@inbounds for allele in all_alleles
for allele_dict in allele_dicts
sum_dict[allele] = get!(sum_dict, allele, (0., 0)) .+ (get!(allele_dict, allele, 0.), 1)
@inbounds sum_dict[allele] = get!(sum_dict, allele, (0., 0)) .+ (get!(allele_dict, allele, 0.), 1)
end
end
avg_dict = Dict{Int16, Float32}()
Expand Down Expand Up @@ -102,7 +119,7 @@ end

#TODO add to docs (API)
"""
allele_freq(allele::Int, genos::T) where T<:GenoArray
allele_freq(allele::Int, genos::GenoArray)
Return the frequency of an `allele` from a vector of `genotypes`
### Example
Expand All @@ -120,7 +137,7 @@ function allele_freq(allele::Int, genos::T) where T<:GenoArray
end

"""
geno_count_observed(locus::T) where T<:GenoArray
geno_count_observed(locus::GenoArray)
Return a `Dict` of genotype counts of a single locus in a
`PopData` object.
"""
Expand All @@ -130,13 +147,13 @@ Return a `Dict` of genotype counts of a single locus in a
d = Dict{Tuple, Float32}()
@inbounds for genotype in skipmissing(locus)
# sum up non-missing genotypes
d[genotype] = get!(d, genotype, 0) + 1
d[genotype] = get!(d, genotype, 0.0) + 1.0
end
return d
end

"""
geno_count_expected(locus::T) where T<:GenoArray
geno_count_expected(locus::GenoArray)
Return a `Dict` of the expected genotype counts of a single locus in a
`PopData` object. Expected counts are calculated as the product of observed
allele frequencies multiplied by the number of non-missing genotypes.
Expand Down Expand Up @@ -167,7 +184,7 @@ end


"""
geno_freq(locus::T) where T<:GenoArray
geno_freq(locus::GenoArray)
Return a `Dict` of genotype frequencies of a single locus in a
`PopData` object.
"""
Expand All @@ -177,7 +194,7 @@ Return a `Dict` of genotype frequencies of a single locus in a
d = Dict{Tuple, Float32}()
@inbounds for genotype in skipmissing(locus)
# sum up non-missing genotypes
d[genotype] = get!(d, genotype, 0) + 1
d[genotype] = get!(d, genotype, 0.0) + 1.0
end
total = values(d) |> sum # sum of all non-missing genotypes
[d[i] = d[i] / total for i in keys(d)] # genotype count/total
Expand All @@ -197,7 +214,7 @@ geno_freq(cats, "fca8")
geno_freq(cats, "fca8", population = true)
```
"""
function geno_freq(data::PopData, locus::String, population::Bool=false)
function geno_freq(data::PopData, locus::String; population::Bool=false)
if !population
tmp = data.loci[data.loci.locus .== locus, :genotype]
geno_freq(tmp)
Expand All @@ -207,9 +224,8 @@ function geno_freq(data::PopData, locus::String, population::Bool=false)
end
end

#TODO replace allele creation with Base.Iterators.product
"""
geno_freq_expected(locus::T) where T<:GenoArray
geno_freq_expected(locus::GenoArray)
Return a `Dict` of the expected genotype frequencies of a single locus in a
`PopData` object. Expected frequencies are calculated as the product of
observed allele frequencies.
Expand All @@ -231,7 +247,7 @@ function geno_freq_expected(locus::T) where T<:GenoArray
# reform genotype frequencies into a Dict
expected = Dict{Tuple, Float64}()
for (geno, freq) in zip(genos, expected_genotype_freq)
expected[geno] = get!(expected, geno, 0) + freq
expected[geno] = get!(expected, geno, 0.0) + freq
end

return expected
Expand All @@ -249,7 +265,7 @@ geno_freq_expeced(cats, "fca8")
geno_freq_expected(cats, "fca8", population = true)
```
"""
function geno_freq_expected(data::PopData, locus::String, population::Bool=false)
function geno_freq_expected(data::PopData, locus::String; population::Bool=false)
if !population
tmp = data.loci[data.loci.locus .== locus, :genotype]
geno_freq_expected(tmp)
Expand Down
53 changes: 37 additions & 16 deletions src/DataExploration.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,48 @@ missing(gulfsharks(), by = "pop")
end
end

#TODO add to docs (Data Exploration page and API)

"""
pairwise_identical(data::PopData)
Return a table of the percent of identical genotypes at each locus between pairs of individuals.
"""
function pairwise_identical(data::PopData)
sample_names = samples(data)
samples_2 = @view sample_names[2:end-1]
sample_pairs = collect(Iterators.product(samples_2, sample_names)) |> vec
sample_names = collect(samples(data))
pairwise_identical(data, sample_names)
end

"""
pairwise_identical(data::PopData, sample_names::Vector{String})
Return a table of the percent of identical genotypes at each locus
between all pairs of provided individuals.
"""
function pairwise_identical(data::PopData, sample_names::Vector{String})
errs = ""
all_samples = samples(data)
if sample_names != all_samples
[errs *= "\n $i" for i in sample_names if i all_samples]
errs != "" && error("Samples not found in the PopData: " * errs)
end
sample_pairs = pairwise_pairs(sample_names)
n = length(sample_pairs)
perc_ident_vec = Vector{Float64}(undef, n)
n_vec = Vector{Int}(undef, n)
popdata_idx = groupby(data.loci, :name)
idx = 0
@inbounds for sample_1 in sample_names
geno_1 = get_sample_genotypes(data, sample_1)
len_1 = length(collect(skipmissing(geno_1)))
Base.Threads.@threads for sample_2 in sample_names[2:end-1]
idx += 1
geno_2 = get_sample_genotypes(data, sample_2)
len_2 = length(collect(skipmissing(geno_2)))
p = Progress(length(sample_pairs), dt = 1, color = :blue)
@inbounds @sync for i in 1:length(sample_pairs)
Base.Threads.@spawn begin
@inbounds geno_1 = popdata_idx[(sample_pairs[i][1],)].genotype
@inbounds geno_2 = popdata_idx[(sample_pairs[i][2],)].genotype
len_1 = nonmissing(geno_1)
len_2 = nonmissing(geno_2)
shared_geno = minimum([len_1, len_2])
shared = sum(skipmissing(geno_1 .== geno_2))
perc_ident_vec[idx] = round(shared/shared_geno, digits = 2)
n_vec[idx] = shared_geno
shared_geno = minimum([len_1, len_2])
shared = sum(skipmissing(geno_1 .== geno_2))
@inbounds perc_ident_vec[i] = round(shared/shared_geno, digits = 2)
@inbounds n_vec[i] = shared_geno
next!(p)
end
end
DataFrame(:sample_1 => getindex.(sample_pairs, 2), :sample_2 => getindex.(sample_pairs, 1), :identical => perc_ident_vec, :n => n_vec)
end
DataFrame(:sample_1 => map(i -> i[1], sample_pairs), :sample_2 => map(i -> i[2], sample_pairs), :identical => perc_ident_vec, :n => n_vec)
end
33 changes: 17 additions & 16 deletions src/FStats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@
)
=#

HT = mean(@inbounds @avx 1.0 .- n_df.avg_freq .+ (n_df.HS ./ n_df.mn ./ n_df.count) - (n_df.Het_obs ./ 2.0 ./ n_df.mn ./ n_df.count))
DST = mean(@inbounds @avx HT .- n_df.HS)
DST′ = mean(@inbounds @avx n_df.count ./ (n_df.count .- 1) .* DST)
HT′ = mean(@inbounds @avx n_df.HS .+ DST′)
HT = mean(@inbounds 1.0 .- n_df.avg_freq .+ (n_df.HS ./ n_df.mn ./ n_df.count) - (n_df.Het_obs ./ 2.0 ./ n_df.mn ./ n_df.count))
DST = mean(@inbounds HT .- n_df.HS)
DST′ = mean(@inbounds n_df.count ./ (n_df.count .- 1) .* DST)
HT′ = mean(@inbounds n_df.HS .+ DST′)

return Dict{Symbol, Float64}(
:FST => DST / HT,
Expand Down Expand Up @@ -96,10 +96,10 @@ end
groupby(het_df, :locus)
)

HT = mean(@inbounds @avx 1.0 .- n_df.avg_freq .+ (n_df.HS ./ n_df.mn ./ n_df.count) - (n_df.Het_obs ./ 2.0 ./ n_df.mn ./ n_df.count))
DST = mean(@inbounds @avx HT .- n_df.HS)
DST′ = mean(@inbounds @avx n_df.count ./ (n_df.count .- 1) .* DST)
HT′ = mean(@inbounds @avx n_df.HS .+ DST′)
HT = mean(@inbounds 1.0 .- n_df.avg_freq + (n_df.HS / n_df.mn / n_df.count) - (n_df.Het_obs / 2.0 / n_df.mn / n_df.count))
DST = mean(@inbounds HT - n_df.HS)
DST′ = mean(@inbounds n_df.count / (n_df.count .- 1) * DST)
HT′ = mean(@inbounds n_df.HS .+ DST′)

return Dict{Symbol, Float64}(
:FST => DST / HT,
Expand All @@ -108,21 +108,22 @@ end
end




function f_stat_p(data::PopData; nperm::Int = 1000)
observed = FST_global(data)
popnames = data.meta.population
perm_dict = Dict{Symbol,Vector{Float64}}(
:FST => Vector{Float64}(undef, nperm-1),
:FST′ => Vector{Float64}(undef, nperm-1)
)

@sync Base.Threads.@spawn for n in 1:nperm-1
tmp = copy(data.loci)
tmp_f = FST_global(permute_samples!(tmp, popnames))
perm_dict[:FST][n] = tmp_f[:FST]
perm_dict[:FST′][n] = tmp_f[:FST′]
p = Progress(length(sample_pairs), dt = 1, color = :blue)
@inbounds @sync for n in 1:nperm-1
Base.Threads.@spawn begin
tmp = copy(data.loci)
tmp_f = FST_global(permute_samples!(tmp, popnames))
perm_dict[:FST][n] = tmp_f[:FST]
perm_dict[:FST′][n] = tmp_f[:FST′]
next!(p)
end
end

@inbounds for (k,v) in perm_dict
Expand Down
Loading

0 comments on commit a218bf4

Please sign in to comment.