diff --git a/src/PopGen.jl b/src/PopGen.jl index 99f3cf186..fc0db7c79 100755 --- a/src/PopGen.jl +++ b/src/PopGen.jl @@ -56,11 +56,12 @@ include("io/Delimited.jl") include("io/Genepop.jl") include("io/Read.jl") include("io/Structure.jl") +include("io/VariantCall.jl") @init @require GeneticVariation="9bc6ac9d-e6b2-5f70-b0a8-242a01662520" begin - include("io/VariantCall.jl") + include("io/VariantCallLazy.jl") end @init @require GeneticVariation="9bc6ac9d-e6b2-5f70-b0a8-242a01662520" begin - @require GZip="92fee26a-97fe-5a0c-ad85-20a5f3185b63" include("io/VariantCallGz.jl") + @require GZip="92fee26a-97fe-5a0c-ad85-20a5f3185b63" include("io/VariantCallGzLazy.jl") end # example data include("io/Datasets.jl") diff --git a/src/io/VariantCall.jl b/src/io/VariantCall.jl index 393f424a7..efa17b979 100755 --- a/src/io/VariantCall.jl +++ b/src/io/VariantCall.jl @@ -1,22 +1,5 @@ -using .GeneticVariation - export bcf, vcf -""" - openvcf(::String) -Open VCF file (`.vcf/.gz`, or `.bcf/.gz`) and return an `IO` stream in reading mode `"r"`. -""" -function openvcf(infile::String) - if endswith(infile, ".vcf") || endswith(infile, ".bcf") - return open(infile, "r") - elseif endswith(infile, ".gz") - throw(ArgumentError("Please load in GZip.jl with \`using GZip\`")) - else - throw(ArgumentError("The filename must end with .vcf or .bcf")) - end -end - - """ bcf(infile::String; ; rename_snp::Bool, silent::Bool, allow_monomorphic::Bool) Load a BCF file into memory as a PopData object. Population information needs to be provided separately. @@ -44,64 +27,7 @@ julia> mydata.loci.genotype = mydata.loci.genotype |> Array{Union{Missing, NTup ``` """ function bcf(infile::String; rename_loci::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false) - bases = (A = Int8(1), T = Int8(2), C = Int8(3), G = Int8(4), miss = Int8(0)) - stream = BCF.Reader(openvcf(infile)) - nmarkers = countlines(openvcf(infile)) - length(BCF.header(stream)) - 1 - sample_ID = header(stream).sampleID - nsamples = length(sample_ID) - loci_names = fill("marker", nmarkers) - geno_df = DataFrame(:name => sample_ID, :population => "missing") - if silent == false - @info "\n$(abspath(infile))\n$nsamples samples detected\n$nmarkers markers detected\npopulation info must be added <---" - end - for record in stream - ref_alt = Dict(-1 => "miss", 0 => BCF.ref(record), [i => j for (i,j) in enumerate(BCF.alt(record))]...) - raw_geno = BCF.genotype(record, 1:nsamples, "GT") - conv_geno = map(raw_geno) do rg - tmp = replace.(rg, "." => "-1") - ig = collect(parse.(Int8, split(tmp, r"\/|\|"))) - [bases[Symbol(ref_alt[i])] for i in ig] |> sort |> Tuple - end - insertcols!(geno_df, Symbol(BCF.chrom(record) * "_" * string(BCF.pos(record))) => conv_geno) - end - close(stream) - if rename_snp - rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)]) - rename!(geno_df, rnm) - end - stacked_geno_df = DataFrames.stack(geno_df, DataFrames.Not(1:2)) - rename!(stacked_geno_df, [:name, :population, :locus, :genotype]) - # set columns as PooledArrays - select!( - stacked_geno_df, - :name => PooledArray => :name, - :population => PooledArray => :population, - :locus => (i -> PooledArray(i |> Vector{String})) => :locus, - :genotype - ) - # replace missing genotypes as missing - stacked_geno_df.genotype = map(stacked_geno_df.genotype) do geno - if all(0 .== geno) - return missing - else - return geno - end - end - sort!(stacked_geno_df, [:name, :locus]) - #meta_df = generate_meta(stacked_geno_df) - # ploidy finding - meta_df = DataFrames.combine(DataFrames.groupby(stacked_geno_df, :name), - :genotype => (i -> Int8(length(first(skipmissing(i))))) => :ploidy - ) - insertcols!(meta_df, 2, :population => "missing") - insertcols!(meta_df, 4, :longitude => Vector{Union{Missing, Float32}}(undef, nsamples), :latitude => Vector{Union{Missing, Float32}}(undef, nsamples)) - - if allow_monomorphic - pd_out = PopData(meta_df, stacked_geno_df) - else - pd_out = drop_monomorphic!(PopData(meta_df, stacked_geno_df)) - end - return pd_out + error("Please load in GeneticVariation.jl with \`using GeneticVariation\`") end ### VCF parsing ### @@ -134,62 +60,5 @@ julia> mydata.loci.genotype = mydata.loci.genotype |> Array{Union{Missing, NTup ``` """ function vcf(infile::String; rename_snp::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false) - bases = (A = Int8(1), T = Int8(2), C = Int8(3), G = Int8(4), miss = Int8(0)) - stream = VCF.Reader(openvcf(infile)) - nmarkers = countlines(openvcf(infile)) - length(VCF.header(stream)) - 1 - sample_ID = header(stream).sampleID - nsamples = length(sample_ID) - loci_names = fill("marker", nmarkers) - geno_df = DataFrame(:name => sample_ID, :population => "missing") - if silent == false - @info "\n$(abspath(infile))\n$nsamples samples detected\n$nmarkers markers detected\npopulation info must be added <---" - end - for record in stream - ref_alt = Dict(-1 => "miss", 0 => VCF.ref(record), [i => j for (i,j) in enumerate(VCF.alt(record))]...) - raw_geno = VCF.genotype(record, 1:nsamples, "GT") - conv_geno = map(raw_geno) do rg - tmp = replace.(rg, "." => "-1") - ig = collect(parse.(Int8, split(tmp, r"\/|\|"))) - [bases[Symbol(ref_alt[i])] for i in ig] |> sort |> Tuple - end - insertcols!(geno_df, Symbol(VCF.chrom(record) * "_" * string(VCF.pos(record))) => conv_geno) - end - close(stream) - if rename_snp - rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)]) - rename!(geno_df, rnm) - end - stacked_geno_df = DataFrames.stack(geno_df, DataFrames.Not(1:2)) - rename!(stacked_geno_df, [:name, :population, :locus, :genotype]) - # set columns as PooledArrays - select!( - stacked_geno_df, - :name => PooledArray => :name, - :population => PooledArray => :population, - :locus => (i -> PooledArray(i |> Vector{String})) => :locus, - :genotype - ) - # replace missing genotypes as missing - stacked_geno_df.genotype = map(stacked_geno_df.genotype) do geno - if all(0 .== geno) - return missing - else - return geno - end - end - sort!(stacked_geno_df, [:name, :locus]) - # ploidy finding - #meta_df = generate_meta(stacked_geno_df) - meta_df = DataFrames.combine(DataFrames.groupby(stacked_geno_df, :name), - :genotype => (i -> Int8(length(first(skipmissing(i))))) => :ploidy - ) - insertcols!(meta_df, 2, :population => "missing") - insertcols!(meta_df, 4, :longitude => Vector{Union{Missing, Float32}}(undef, nsamples), :latitude => Vector{Union{Missing, Float32}}(undef, nsamples)) - - if allow_monomorphic - pd_out = PopData(meta_df, stacked_geno_df) - else - pd_out = drop_monomorphic!(PopData(meta_df, stacked_geno_df)) - end - return pd_out + error("Please load in GeneticVariation.jl with \`using GeneticVariation\`") end diff --git a/src/io/VariantCallGz.jl b/src/io/VariantCallGz.jl deleted file mode 100644 index 161e5c3a6..000000000 --- a/src/io/VariantCallGz.jl +++ /dev/null @@ -1,12 +0,0 @@ -using .GZip - -# if GZip is loaded in, overwrite openvcf to this method -function openvcf(infile::String) - if endswith(infile, ".vcf") || endswith(infile, ".bcf") - return open(infile, "r") - elseif endswith(infile, ".vcf.gz") || endswith(infile, ".bcf.gz") - return GZip.open(infile, "r") - else - throw(ArgumentError("The filename must end with .vcf/.bcf or .vcf.gz/.bcf.gz")) - end -end diff --git a/src/io/VariantCallGzLazy.jl b/src/io/VariantCallGzLazy.jl new file mode 100644 index 000000000..4318003b0 --- /dev/null +++ b/src/io/VariantCallGzLazy.jl @@ -0,0 +1,10 @@ +using .GZip + +# if GZip is loaded in, overwrite openvcf to this method +function openvcf(infile::String) + if endswith(infile, ".gz") + return GZip.open(infile, "r") + else + return open(infile, "r") + end +end \ No newline at end of file diff --git a/src/io/VariantCallLazy.jl b/src/io/VariantCallLazy.jl new file mode 100755 index 000000000..6baf5bcd4 --- /dev/null +++ b/src/io/VariantCallLazy.jl @@ -0,0 +1,140 @@ +using .GeneticVariation + +export bcf, vcf + +""" + openvcf(::String) +Open VCF file (`.vcf(.gz)`, or `.bcf(.gz)`) and return an `IO` stream in reading mode `"r"`. +""" +function openvcf(infile::String) + if endswith(infile, ".gz") + throw(ArgumentError("Please load in GZip.jl with \`using GZip\`")) + else + return open(infile, "r") + end +end + + +function bcf(infile::String; rename_loci::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false) + bases = (A = Int8(1), T = Int8(2), C = Int8(3), G = Int8(4), miss = Int8(0)) + stream = BCF.Reader(openvcf(infile)) + nmarkers = countlines(openvcf(infile)) - length(BCF.header(stream)) - 1 + sample_ID = header(stream).sampleID + nsamples = length(sample_ID) + loci_names = fill("marker", nmarkers) + geno_df = DataFrame(:name => sample_ID, :population => "missing") + if silent == false + @info "\n$(abspath(infile))\n$nsamples samples detected\n$nmarkers markers detected\npopulation info must be added <---" + end + for record in stream + ref_alt = Dict(-1 => "miss", 0 => BCF.ref(record), [i => j for (i,j) in enumerate(BCF.alt(record))]...) + raw_geno = BCF.genotype(record, 1:nsamples, "GT") + conv_geno = map(raw_geno) do rg + tmp = replace.(rg, "." => "-1") + ig = collect(parse.(Int8, split(tmp, r"\/|\|"))) + [bases[Symbol(ref_alt[i])] for i in ig] |> sort |> Tuple + end + insertcols!(geno_df, Symbol(BCF.chrom(record) * "_" * string(BCF.pos(record))) => conv_geno) + end + close(stream) + if rename_snp + rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)]) + rename!(geno_df, rnm) + end + stacked_geno_df = DataFrames.stack(geno_df, DataFrames.Not(1:2)) + rename!(stacked_geno_df, [:name, :population, :locus, :genotype]) + # set columns as PooledArrays + select!( + stacked_geno_df, + :name => PooledArray => :name, + :population => PooledArray => :population, + :locus => (i -> PooledArray(i |> Vector{String})) => :locus, + :genotype + ) + # replace missing genotypes as missing + stacked_geno_df.genotype = map(stacked_geno_df.genotype) do geno + if all(0 .== geno) + return missing + else + return geno + end + end + sort!(stacked_geno_df, [:name, :locus]) + #meta_df = generate_meta(stacked_geno_df) + # ploidy finding + meta_df = DataFrames.combine(DataFrames.groupby(stacked_geno_df, :name), + :genotype => (i -> Int8(length(first(skipmissing(i))))) => :ploidy + ) + insertcols!(meta_df, 2, :population => "missing") + insertcols!(meta_df, 4, :longitude => Vector{Union{Missing, Float32}}(undef, nsamples), :latitude => Vector{Union{Missing, Float32}}(undef, nsamples)) + + if allow_monomorphic + pd_out = PopData(meta_df, stacked_geno_df) + else + pd_out = drop_monomorphic!(PopData(meta_df, stacked_geno_df)) + end + return pd_out +end + +### VCF parsing ### + +function vcf(infile::String; rename_snp::Bool = false, silent::Bool = false, allow_monomorphic::Bool = false) + bases = (A = Int8(1), T = Int8(2), C = Int8(3), G = Int8(4), miss = Int8(0)) + stream = VCF.Reader(openvcf(infile)) + nmarkers = countlines(openvcf(infile)) - length(VCF.header(stream)) - 1 + sample_ID = header(stream).sampleID + nsamples = length(sample_ID) + loci_names = fill("marker", nmarkers) + geno_df = DataFrame(:name => sample_ID, :population => "missing") + if silent == false + @info "\n$(abspath(infile))\n$nsamples samples detected\n$nmarkers markers detected\npopulation info must be added <---" + end + for record in stream + ref_alt = Dict(-1 => "miss", 0 => VCF.ref(record), [i => j for (i,j) in enumerate(VCF.alt(record))]...) + raw_geno = VCF.genotype(record, 1:nsamples, "GT") + conv_geno = map(raw_geno) do rg + tmp = replace.(rg, "." => "-1") + ig = collect(parse.(Int8, split(tmp, r"\/|\|"))) + [bases[Symbol(ref_alt[i])] for i in ig] |> sort |> Tuple + end + insertcols!(geno_df, Symbol(VCF.chrom(record) * "_" * string(VCF.pos(record))) => conv_geno) + end + close(stream) + if rename_snp + rnm = append!([:name, :population], [Symbol.("snp_" * i) for i in string.(1:nmarkers)]) + rename!(geno_df, rnm) + end + stacked_geno_df = DataFrames.stack(geno_df, DataFrames.Not(1:2)) + rename!(stacked_geno_df, [:name, :population, :locus, :genotype]) + # set columns as PooledArrays + select!( + stacked_geno_df, + :name => PooledArray => :name, + :population => PooledArray => :population, + :locus => (i -> PooledArray(i |> Vector{String})) => :locus, + :genotype + ) + # replace missing genotypes as missing + stacked_geno_df.genotype = map(stacked_geno_df.genotype) do geno + if all(0 .== geno) + return missing + else + return geno + end + end + sort!(stacked_geno_df, [:name, :locus]) + # ploidy finding + #meta_df = generate_meta(stacked_geno_df) + meta_df = DataFrames.combine(DataFrames.groupby(stacked_geno_df, :name), + :genotype => (i -> Int8(length(first(skipmissing(i))))) => :ploidy + ) + insertcols!(meta_df, 2, :population => "missing") + insertcols!(meta_df, 4, :longitude => Vector{Union{Missing, Float32}}(undef, nsamples), :latitude => Vector{Union{Missing, Float32}}(undef, nsamples)) + + if allow_monomorphic + pd_out = PopData(meta_df, stacked_geno_df) + else + pd_out = drop_monomorphic!(PopData(meta_df, stacked_geno_df)) + end + return pd_out +end