Skip to content

Commit

Permalink
initial LOCO updates
Browse files Browse the repository at this point in the history
  • Loading branch information
joshua-slaughter committed Mar 8, 2024
1 parent 66af05f commit e7c25bb
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 1 deletion.
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ version = "0.1.0"
ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
BGEN = "6db4b851-9beb-4b83-9d64-eb1cfb37721d"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
Expand All @@ -18,12 +19,12 @@ TMLE = "8afdd2fb-6e73-43df-8b62-b1650cd9c8cf"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"

[compat]
TMLE = "0.14"
ArgParse = "1.1"
BGEN = "0.2.1"
CSV = "0.9, 0.10"
CategoricalArrays = "0.10"
Combinatorics = "1.0"
DataFrames = "1.2"
SnpArrays = "0.3"
TMLE = "0.14"
YAML = "0.4"
1 change: 1 addition & 0 deletions src/TargeneCore.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ include(joinpath("tmle_inputs", "tmle_inputs.jl"))
include(joinpath("tmle_inputs", "from_actors.jl"))
include(joinpath("tmle_inputs", "from_param_files.jl"))
include(joinpath("tmle_inputs", "allele_independent_estimands.jl"))
include(joinpath("tmle_inputs", "loco_gwas.jl"))

###############################################################################
### EXPORTS ###
Expand Down
3 changes: 3 additions & 0 deletions src/tmle_inputs/allele_independent_estimands.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,10 @@ function allele_independent_estimands(parsed_args)
# Genotypes and final dataset
variants_set = Set(retrieve_variants_list(variants_config))
genotypes = call_genotypes(bgen_prefix, variants_set, call_threshold)
# snparrays()

dataset = merge(traits, pcs, genotypes)
println(dataset)
Arrow.write(string(outprefix, ".data.arrow"), dataset)

# Estimands
Expand Down
78 changes: 78 additions & 0 deletions src/tmle_inputs/loco_gwas.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
mutable struct BatchManager
outprefix::String
max_batch_size::Union{Int, Nothing}
current_batch_id::Int
current_estimands::Vector
current_batch_size::Int
batch_id::Int
BatchManager(outprefix, max_batch_size) = new(outprefix, max_batch_size, 1, [], 0, 1)
end

function save_batch!(batch_saver::BatchManager, groupname)
batchfilename = batch_name(string(batch_saver.outprefix, ".", groupname), batch_saver.batch_id)
serialize(batchfilename, Configuration(estimands=batch_saver.current_estimands))
batch_saver.batch_id += 1
batch_saver.current_estimands = []
batch_saver.current_batch_size = 0
end

"""
Here we aim to accomplish a few things to perform variant effect size estimates across the genome
1. Loading the merged {.bed, .bim, .fam} files using SnpArrays this can be grabbed after RunPCALoco process
2. Loading the relative PC files also from the RunPCALoco process
3. Iterating through the merged genotype files and computing the factorialATE() for each variant on a specified target
"""
function loco_gwas(parsed_args)
outprefix = parsed_args["out-prefix"]
batch_saver = BatchManager(outprefix, parsed_args["batch-size"])
call_threshold = parsed_args["call-threshold"]
bgen_prefix = parsed_args["bgen-prefix"]
bim_prefix = parsed_args["bim-prefix"]
positivity_constraint = parsed_args["positivity-constraint"]
traits = read_data(parsed_args["traits"])
# work out logic for PCs
pcs = read_data(parsed_args["pcs"])
# logic for config needs to be updated
config = YAML.load_file(parsed_args["loco-gwas"]["config"])

# Variables
# variants_config = config["variants"]
extra_treatments = haskey(config, "extra_treatments") ? Symbol.(config["extra_treatments"]) : []
outcome_extra_covariates = haskey(config, "outcome_extra_covariates") ? Symbol.(config["outcome_extra_covariates"]) : []
extra_confounders = haskey(config, "extra_confounders") ? Symbol.(config["extra_confounders"]) : []
confounders = all_confounders(pcs, extra_confounders)
nonoutcomes = Set(vcat(:SAMPLE_ID, extra_confounders, outcome_extra_covariates, extra_treatments))
outcomes = filter(x -> x nonoutcomes, Symbol.(names(traits)))

# Genotypes and final dataset
variants_set = read_bim(bim_prefix)
# genotypes = call_genotypes(bgen_prefix, variants_set, call_threshold)

dataset = DataFrame(Arrow.Table())
for col in snparray
x = convert(Vector{Float64}, [col])
dataset[!, :x] = x
end

# dataset = merge(traits, pcs, genotypes)
# println(dataset)
# Arrow.write(string(outprefix, ".data.arrow"), dataset)

# # Estimands
# for estimand_type in config["estimands"]
# if estimand_type == "IATE"
# orders = config["orders"]
# generate_iates!(batch_saver, dataset, variants_config, outcomes, confounders;
# extra_treatments=extra_treatments,
# outcome_extra_covariates=outcome_extra_covariates,
# positivity_constraint=positivity_constraint,
# orders=orders
# )
# else
# throw(ArgumentError(string("Unknown estimand type: ", estimand_type)))
# end
# end

return 0
end
52 changes: 52 additions & 0 deletions src/tmle_inputs/tmle_inputs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,31 @@ NotAllVariantsFoundError(rsids) =
ArgumentError(string("Some variants were not found in the genotype files: ", join(rsids, ", ")))

NotBiAllelicOrUnphasedVariantError(rsid) = ArgumentError(string("Variant: ", rsid, " is not bi-allelic or not unphased."))

"""
b = read_bgen("/exports/igmm/eddie/khamseh-lab/jslaughter/targene_dev/TargeneCore.jl/test/data/ukbb/imputed/ukb_53116_chr1.bgen")
rsid = "rs76716020" # some string that is the rsid (multi example)
(file_start_position = [49141078487, 49141082493],
allele1 = ["A", "A"],
allele2 = ["G", "T"],)
"""
function is_multiallelic(b::Bgen, rsid::AbstractString)
@assert !(rsid == nothing) "provide RSID"
BGEN._check_idx(b)
q = "SELECT file_start_position, allele1, allele2 FROM Variant WHERE rsid= ?"
idx = b.idx
params = (rsid,)
r = (BGEN.DBInterface.execute(idx.db, q, params) |> BGEN.columntable)#[1]
if length(r) == 0
error("variant with rsid $rsid not found")
end
if length(r[1]) > 1
return true
else
return false
end
end

"""
bgen_files(snps, bgen_prefix)
Expand All @@ -80,9 +105,15 @@ function call_genotypes(bgen_prefix::String, query_rsids::Set{<:AbstractString},
bgenfile = read_bgen(joinpath(chr_dir_, filename))
chr_genotypes = DataFrame(SAMPLE_ID=bgenfile.samples)
bgen_rsids = Set(rsids(bgenfile))

for query_rsid in query_rsids
if query_rsid bgen_rsids
pop!(query_rsids, query_rsid)
# insert function that filters for bi-allelic variants
if is_multiallelic(bgenfile, query_rsid)
@warn("Skipping $query_rsid, not bi-allelic")
continue
end
variant = variant_by_rsid(bgenfile, query_rsid)
if n_alleles(variant) != 2
@warn("Skipping $query_rsid, not bi-allelic")
Expand Down Expand Up @@ -191,9 +222,30 @@ function tmle_inputs(parsed_args)
tmle_inputs_from_param_files(parsed_args)
elseif parsed_args["%COMMAND%"] == "allele-independent"
allele_independent_estimands(parsed_args)
elseif parsed_args["%COMMAND%"] == "loco-gwas"
loco_gwas(parsed_args)
else
throw(ArgumentError("Unrecognized command."))
end
end

function read_bim(bim_prefix::String)
chr_dir, prefix_ = splitdir(bim_prefix)

query_rsids::Set{<:AbstractString} = Set{AbstractString}()

for filename in readdir(chr_dir)
if endswith(filename, ".bim") ? true : false
file = open(joinpath(chr_dir,filename), "r")
for line in eachline(file)
bim_entry = split(line, '\t')
if startswith(bim_entry[2], "rs") ? true : false
# bim_entry[2] = replace(bim_entry[2], "rs" => "RSID_")
push!(query_rsids, bim_entry[2])
end
end
close(file)
end
end
return query_rsids
end
23 changes: 23 additions & 0 deletions test/data/gwas_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
orders: [2, 3]
estimands:
- IATE
variants:
TF1:
bQTLs:
- RSID_17
- RSID_99
eQTLs:
- RSID_102
TF2:
bQTLs:
- RSID_17
- RSID_198
eQTLs:
- RSID_2
extra_treatments:
- TREAT_1
outcome_extra_covariates:
- COV_1
extra_confounders:
- 21003
- 22001
32 changes: 32 additions & 0 deletions test/tmle_inputs/loco_gwas.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
module TestLOCOGWAS

using Test
using TargeneCore
using Arrow
using DataFrames
using Serialization

TESTDIR = joinpath(pkgdir(TargeneCore), "test")

include(joinpath(TESTDIR, "tmle_inputs", "test_utils.jl"))

@testset "Test LOCO-GWAS: with positivity constraint" begin
tmpdir = mktempdir()
parsed_args = Dict(
"loco-gwas" => Dict{String, Any}(
"config" => joinpath(TESTDIR, "data", "gwas_config.yaml"),
),
"traits" => joinpath(TESTDIR, "data", "traits_1.csv"), #investigate
"pcs" => joinpath(TESTDIR, "data", "pcs.csv"), #investigate
"%COMMAND%" => "loco-gwas",
"out-prefix" => joinpath(tmpdir, "final"),
"bgen-prefix" => joinpath(TESTDIR, "data", "ukbb", "imputed", "ukb_53116"),
"bim-prefix" => joinpath(TESTDIR, "data", "ukbb", "genotypes", "ukb_53116"),
"call-threshold" => 0.8,
"batch-size" => nothing,
"positivity-constraint" => 0.01,
)
tmle_inputs(parsed_args)
end

end

0 comments on commit e7c25bb

Please sign in to comment.