Skip to content

Commit

Permalink
Merge pull request #58 from pdimens/dev
Browse files Browse the repository at this point in the history
pull 0.4.5 into release
  • Loading branch information
pdimens authored Feb 9, 2021
2 parents 9e01294 + 15d5113 commit 0cbdcf9
Show file tree
Hide file tree
Showing 5 changed files with 155 additions and 147 deletions.
5 changes: 3 additions & 2 deletions src/PopGen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,12 @@ include("io/Delimited.jl")
include("io/Genepop.jl")
include("io/Read.jl")
include("io/Structure.jl")
include("io/VariantCall.jl")
@init @require GeneticVariation="9bc6ac9d-e6b2-5f70-b0a8-242a01662520" begin
include("io/VariantCall.jl")
include("io/VariantCallLazy.jl")
end
@init @require GeneticVariation="9bc6ac9d-e6b2-5f70-b0a8-242a01662520" begin
@require GZip="92fee26a-97fe-5a0c-ad85-20a5f3185b63" include("io/VariantCallGz.jl")
@require GZip="92fee26a-97fe-5a0c-ad85-20a5f3185b63" include("io/VariantCallGzLazy.jl")
end
# example data
include("io/Datasets.jl")
Expand Down
135 changes: 2 additions & 133 deletions src/io/VariantCall.jl
Original file line number Diff line number Diff line change
@@ -1,22 +1,5 @@
using .GeneticVariation

export bcf, vcf

"""
openvcf(::String)
Open VCF file (`.vcf/.gz`, or `.bcf/.gz`) and return an `IO` stream in reading mode `"r"`.
"""
function openvcf(infile::String)
if endswith(infile, ".vcf") || endswith(infile, ".bcf")
return open(infile, "r")
elseif endswith(infile, ".gz")
throw(ArgumentError("Please load in GZip.jl with \`using GZip\`"))
else
throw(ArgumentError("The filename must end with .vcf or .bcf"))
end
end


"""
bcf(infile::String; ; rename_snp::Bool, silent::Bool, allow_monomorphic::Bool)
Load a BCF file into memory as a PopData object. Population information needs to be provided separately.
Expand Down Expand Up @@ -44,64 +27,7 @@ julia> mydata.loci.genotype = mydata.loci.genotype |> Array{Union{Missing, NTup
```
"""
function bcf(infile::String; rename_loci::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false)
bases = (A = Int8(1), T = Int8(2), C = Int8(3), G = Int8(4), miss = Int8(0))
stream = BCF.Reader(openvcf(infile))
nmarkers = countlines(openvcf(infile)) - length(BCF.header(stream)) - 1
sample_ID = header(stream).sampleID
nsamples = length(sample_ID)
loci_names = fill("marker", nmarkers)
geno_df = DataFrame(:name => sample_ID, :population => "missing")
if silent == false
@info "\n$(abspath(infile))\n$nsamples samples detected\n$nmarkers markers detected\npopulation info must be added <---"
end
for record in stream
ref_alt = Dict(-1 => "miss", 0 => BCF.ref(record), [i => j for (i,j) in enumerate(BCF.alt(record))]...)
raw_geno = BCF.genotype(record, 1:nsamples, "GT")
conv_geno = map(raw_geno) do rg
tmp = replace.(rg, "." => "-1")
ig = collect(parse.(Int8, split(tmp, r"\/|\|")))
[bases[Symbol(ref_alt[i])] for i in ig] |> sort |> Tuple
end
insertcols!(geno_df, Symbol(BCF.chrom(record) * "_" * string(BCF.pos(record))) => conv_geno)
end
close(stream)
if rename_snp
rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)])
rename!(geno_df, rnm)
end
stacked_geno_df = DataFrames.stack(geno_df, DataFrames.Not(1:2))
rename!(stacked_geno_df, [:name, :population, :locus, :genotype])
# set columns as PooledArrays
select!(
stacked_geno_df,
:name => PooledArray => :name,
:population => PooledArray => :population,
:locus => (i -> PooledArray(i |> Vector{String})) => :locus,
:genotype
)
# replace missing genotypes as missing
stacked_geno_df.genotype = map(stacked_geno_df.genotype) do geno
if all(0 .== geno)
return missing
else
return geno
end
end
sort!(stacked_geno_df, [:name, :locus])
#meta_df = generate_meta(stacked_geno_df)
# ploidy finding
meta_df = DataFrames.combine(DataFrames.groupby(stacked_geno_df, :name),
:genotype => (i -> Int8(length(first(skipmissing(i))))) => :ploidy
)
insertcols!(meta_df, 2, :population => "missing")
insertcols!(meta_df, 4, :longitude => Vector{Union{Missing, Float32}}(undef, nsamples), :latitude => Vector{Union{Missing, Float32}}(undef, nsamples))

if allow_monomorphic
pd_out = PopData(meta_df, stacked_geno_df)
else
pd_out = drop_monomorphic!(PopData(meta_df, stacked_geno_df))
end
return pd_out
error("Please load in GeneticVariation.jl with \`using GeneticVariation\`")
end

### VCF parsing ###
Expand Down Expand Up @@ -134,62 +60,5 @@ julia> mydata.loci.genotype = mydata.loci.genotype |> Array{Union{Missing, NTup
```
"""
function vcf(infile::String; rename_snp::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false)
bases = (A = Int8(1), T = Int8(2), C = Int8(3), G = Int8(4), miss = Int8(0))
stream = VCF.Reader(openvcf(infile))
nmarkers = countlines(openvcf(infile)) - length(VCF.header(stream)) - 1
sample_ID = header(stream).sampleID
nsamples = length(sample_ID)
loci_names = fill("marker", nmarkers)
geno_df = DataFrame(:name => sample_ID, :population => "missing")
if silent == false
@info "\n$(abspath(infile))\n$nsamples samples detected\n$nmarkers markers detected\npopulation info must be added <---"
end
for record in stream
ref_alt = Dict(-1 => "miss", 0 => VCF.ref(record), [i => j for (i,j) in enumerate(VCF.alt(record))]...)
raw_geno = VCF.genotype(record, 1:nsamples, "GT")
conv_geno = map(raw_geno) do rg
tmp = replace.(rg, "." => "-1")
ig = collect(parse.(Int8, split(tmp, r"\/|\|")))
[bases[Symbol(ref_alt[i])] for i in ig] |> sort |> Tuple
end
insertcols!(geno_df, Symbol(VCF.chrom(record) * "_" * string(VCF.pos(record))) => conv_geno)
end
close(stream)
if rename_snp
rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)])
rename!(geno_df, rnm)
end
stacked_geno_df = DataFrames.stack(geno_df, DataFrames.Not(1:2))
rename!(stacked_geno_df, [:name, :population, :locus, :genotype])
# set columns as PooledArrays
select!(
stacked_geno_df,
:name => PooledArray => :name,
:population => PooledArray => :population,
:locus => (i -> PooledArray(i |> Vector{String})) => :locus,
:genotype
)
# replace missing genotypes as missing
stacked_geno_df.genotype = map(stacked_geno_df.genotype) do geno
if all(0 .== geno)
return missing
else
return geno
end
end
sort!(stacked_geno_df, [:name, :locus])
# ploidy finding
#meta_df = generate_meta(stacked_geno_df)
meta_df = DataFrames.combine(DataFrames.groupby(stacked_geno_df, :name),
:genotype => (i -> Int8(length(first(skipmissing(i))))) => :ploidy
)
insertcols!(meta_df, 2, :population => "missing")
insertcols!(meta_df, 4, :longitude => Vector{Union{Missing, Float32}}(undef, nsamples), :latitude => Vector{Union{Missing, Float32}}(undef, nsamples))

if allow_monomorphic
pd_out = PopData(meta_df, stacked_geno_df)
else
pd_out = drop_monomorphic!(PopData(meta_df, stacked_geno_df))
end
return pd_out
error("Please load in GeneticVariation.jl with \`using GeneticVariation\`")
end
12 changes: 0 additions & 12 deletions src/io/VariantCallGz.jl

This file was deleted.

10 changes: 10 additions & 0 deletions src/io/VariantCallGzLazy.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
using .GZip

# if GZip is loaded in, overwrite openvcf to this method
function openvcf(infile::String)
if endswith(infile, ".gz")
return GZip.open(infile, "r")
else
return open(infile, "r")
end
end
140 changes: 140 additions & 0 deletions src/io/VariantCallLazy.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
using .GeneticVariation

export bcf, vcf

"""
openvcf(::String)
Open VCF file (`.vcf(.gz)`, or `.bcf(.gz)`) and return an `IO` stream in reading mode `"r"`.
"""
function openvcf(infile::String)
if endswith(infile, ".gz")
throw(ArgumentError("Please load in GZip.jl with \`using GZip\`"))
else
return open(infile, "r")
end
end


function bcf(infile::String; rename_loci::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false)
bases = (A = Int8(1), T = Int8(2), C = Int8(3), G = Int8(4), miss = Int8(0))
stream = BCF.Reader(openvcf(infile))
nmarkers = countlines(openvcf(infile)) - length(BCF.header(stream)) - 1
sample_ID = header(stream).sampleID
nsamples = length(sample_ID)
loci_names = fill("marker", nmarkers)
geno_df = DataFrame(:name => sample_ID, :population => "missing")
if silent == false
@info "\n$(abspath(infile))\n$nsamples samples detected\n$nmarkers markers detected\npopulation info must be added <---"
end
for record in stream
ref_alt = Dict(-1 => "miss", 0 => BCF.ref(record), [i => j for (i,j) in enumerate(BCF.alt(record))]...)
raw_geno = BCF.genotype(record, 1:nsamples, "GT")
conv_geno = map(raw_geno) do rg
tmp = replace.(rg, "." => "-1")
ig = collect(parse.(Int8, split(tmp, r"\/|\|")))
[bases[Symbol(ref_alt[i])] for i in ig] |> sort |> Tuple
end
insertcols!(geno_df, Symbol(BCF.chrom(record) * "_" * string(BCF.pos(record))) => conv_geno)
end
close(stream)
if rename_snp
rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)])
rename!(geno_df, rnm)
end
stacked_geno_df = DataFrames.stack(geno_df, DataFrames.Not(1:2))
rename!(stacked_geno_df, [:name, :population, :locus, :genotype])
# set columns as PooledArrays
select!(
stacked_geno_df,
:name => PooledArray => :name,
:population => PooledArray => :population,
:locus => (i -> PooledArray(i |> Vector{String})) => :locus,
:genotype
)
# replace missing genotypes as missing
stacked_geno_df.genotype = map(stacked_geno_df.genotype) do geno
if all(0 .== geno)
return missing
else
return geno
end
end
sort!(stacked_geno_df, [:name, :locus])
#meta_df = generate_meta(stacked_geno_df)
# ploidy finding
meta_df = DataFrames.combine(DataFrames.groupby(stacked_geno_df, :name),
:genotype => (i -> Int8(length(first(skipmissing(i))))) => :ploidy
)
insertcols!(meta_df, 2, :population => "missing")
insertcols!(meta_df, 4, :longitude => Vector{Union{Missing, Float32}}(undef, nsamples), :latitude => Vector{Union{Missing, Float32}}(undef, nsamples))

if allow_monomorphic
pd_out = PopData(meta_df, stacked_geno_df)
else
pd_out = drop_monomorphic!(PopData(meta_df, stacked_geno_df))
end
return pd_out
end

### VCF parsing ###

function vcf(infile::String; rename_snp::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false)
bases = (A = Int8(1), T = Int8(2), C = Int8(3), G = Int8(4), miss = Int8(0))
stream = VCF.Reader(openvcf(infile))
nmarkers = countlines(openvcf(infile)) - length(VCF.header(stream)) - 1
sample_ID = header(stream).sampleID
nsamples = length(sample_ID)
loci_names = fill("marker", nmarkers)
geno_df = DataFrame(:name => sample_ID, :population => "missing")
if silent == false
@info "\n$(abspath(infile))\n$nsamples samples detected\n$nmarkers markers detected\npopulation info must be added <---"
end
for record in stream
ref_alt = Dict(-1 => "miss", 0 => VCF.ref(record), [i => j for (i,j) in enumerate(VCF.alt(record))]...)
raw_geno = VCF.genotype(record, 1:nsamples, "GT")
conv_geno = map(raw_geno) do rg
tmp = replace.(rg, "." => "-1")
ig = collect(parse.(Int8, split(tmp, r"\/|\|")))
[bases[Symbol(ref_alt[i])] for i in ig] |> sort |> Tuple
end
insertcols!(geno_df, Symbol(VCF.chrom(record) * "_" * string(VCF.pos(record))) => conv_geno)
end
close(stream)
if rename_snp
rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)])
rename!(geno_df, rnm)
end
stacked_geno_df = DataFrames.stack(geno_df, DataFrames.Not(1:2))
rename!(stacked_geno_df, [:name, :population, :locus, :genotype])
# set columns as PooledArrays
select!(
stacked_geno_df,
:name => PooledArray => :name,
:population => PooledArray => :population,
:locus => (i -> PooledArray(i |> Vector{String})) => :locus,
:genotype
)
# replace missing genotypes as missing
stacked_geno_df.genotype = map(stacked_geno_df.genotype) do geno
if all(0 .== geno)
return missing
else
return geno
end
end
sort!(stacked_geno_df, [:name, :locus])
# ploidy finding
#meta_df = generate_meta(stacked_geno_df)
meta_df = DataFrames.combine(DataFrames.groupby(stacked_geno_df, :name),
:genotype => (i -> Int8(length(first(skipmissing(i))))) => :ploidy
)
insertcols!(meta_df, 2, :population => "missing")
insertcols!(meta_df, 4, :longitude => Vector{Union{Missing, Float32}}(undef, nsamples), :latitude => Vector{Union{Missing, Float32}}(undef, nsamples))

if allow_monomorphic
pd_out = PopData(meta_df, stacked_geno_df)
else
pd_out = drop_monomorphic!(PopData(meta_df, stacked_geno_df))
end
return pd_out
end

0 comments on commit 0cbdcf9

Please sign in to comment.