Skip to content

Commit

Permalink
Merge pull request #185 from TARGENE/brh_data_source
Browse files Browse the repository at this point in the history
Add functionality to run from_actors mode with more than one TF
  • Loading branch information
roskamsh authored Nov 17, 2023
2 parents 4dcf5c4 + 45af342 commit 33c3ff8
Show file tree
Hide file tree
Showing 8 changed files with 124 additions and 18 deletions.
2 changes: 1 addition & 1 deletion src/confounders.jl
Original file line number Diff line number Diff line change
Expand Up @@ -106,4 +106,4 @@ function adapt_flashpca(parsed_args)
pcs = CSV.File(parsed_args["input"], drop=["FID"]) |> DataFrame
rename!(pcs, :IID => :SAMPLE_ID)
CSV.write(parsed_args["output"], pcs)
end
end
35 changes: 28 additions & 7 deletions src/tmle_inputs/from_actors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@ function treatments_from_actors(bqtl_file, env_file, trans_actors_prefix)
bqtls, transactors, extraT
end

"""
    filter_snps_by_tf(df, tf_name)

Restrict a SNP table (or a vector of SNP tables) to rows whose `TF` column
equals `tf_name`. Passing `tf_name === nothing` means "no TF filtering":
the input is returned unchanged.
"""
filter_snps_by_tf(df::DataFrame, tf_name::AbstractString) = df[df.TF .== tf_name, :]
filter_snps_by_tf(df::DataFrame, tf_name::Nothing) = df
# Vector case: apply the single-table method to each element.
filter_snps_by_tf(df_vector, tf_name::AbstractString) = map(df -> filter_snps_by_tf(df, tf_name), df_vector)
filter_snps_by_tf(df_vector, tf_name::Nothing) = df_vector

# Build the order-`order` treatment combinations of trans-acting components.
# `combinations` (Combinatorics) yields each size-`order` subset as a vector of
# DataFrames; the extra-treatment table, when present, participates as one more
# combinable component alongside the trans-actors.
combine_trans_actors(trans_actors::Vector{DataFrame}, extraT::DataFrame, order) = combinations([trans_actors..., extraT], order)
# No extra treatment: combine the trans-actors alone.
combine_trans_actors(trans_actors::Vector{DataFrame}, extraT::Nothing, order) = combinations(trans_actors, order)
# No trans-actors: the only combination is the extra treatment by itself
# (`order` is ignored in this case).
combine_trans_actors(trans_actors::Nothing, extraT::DataFrame, order) = [[extraT]]


function combine_by_bqtl(bqtls::DataFrame, trans_actors::Union{Vector{DataFrame}, Nothing}, extraT::Union{DataFrame, Nothing}, order::Int)
treatment_combinations = Vector{Symbol}[]
if order == 1
Expand Down Expand Up @@ -44,8 +47,11 @@ all_variants(bqtls::DataFrame, transactors::Vector{DataFrame}) = Set(vcat(bqtls.


# No SNP file supplied: propagate `nothing` so callers can skip this source.
read_snps_from_csv(path::Nothing) = nothing
read_snps_from_csv(path::String) = unique(CSV.read(path, DataFrame; select=[:ID, :CHR]), :ID)

"""
    read_snps_from_csv(path::String)

Read a SNP table from `path` and deduplicate it.

If the file carries a `TF` (transcription factor) column, the `ID`, `CHR`
and `TF` columns are kept and rows are unique over `(ID, TF)`, so the same
SNP may appear once per TF. Otherwise only `ID` and `CHR` are kept, unique
over `ID`.
"""
function read_snps_from_csv(path::String)
    df = CSV.read(path, DataFrame)
    if "TF" in names(df)
        return unique(df[:, [:ID, :CHR, :TF]], [:ID, :TF])
    end
    return unique(df[:, [:ID, :CHR]], :ID)
end

trans_actors_from_prefix(trans_actors_prefix::Nothing) = nothing
function trans_actors_from_prefix(trans_actors_prefix::String)
Expand Down Expand Up @@ -156,7 +162,14 @@ function parameters_from_actors(bqtls, transactors, data, variables, orders, out
for order in orders
# First generate the `T` section
treatment_combinations = TargeneCore.combine_by_bqtl(bqtls, transactors, extraT_df, order)
# If there are duplicates here, remove them
treatment_combinations = unique(treatment_combinations)
for treatments in treatment_combinations
# If RSID is duplicated in treatments, skip
if length(treatments) != length(unique(treatments))
continue
end

addParameters!(parameters, treatments, variables, data; positivity_constraint=positivity_constraint)

if batch_size !== nothing && size(parameters, 1) >= batch_size
Expand Down Expand Up @@ -208,11 +221,19 @@ function tmle_inputs_from_actors(parsed_args)

# Parameter files
variables = TargeneCore.get_variables(pcs, traits, extraW, extraC, extraT)
TargeneCore.parameters_from_actors(
bqtls, transactors, data, variables, orders, outprefix;

# Loop through each TF present in bqtls file
tfs = "TF" in names(bqtls) ? unique(bqtls.TF) : [nothing]
for tf in tfs
outprefix_tf = tf !== nothing ? string(outprefix,".",tf) : outprefix
bqtls_tf = TargeneCore.filter_snps_by_tf(bqtls, tf)
transactors_tf = TargeneCore.filter_snps_by_tf(transactors, tf)
TargeneCore.parameters_from_actors(
bqtls_tf, transactors_tf, data, variables, orders, outprefix_tf;
positivity_constraint=positivity_constraint, batch_size=batch_size
)
)
end

# write data
Arrow.write(string(outprefix, ".data.arrow"), data)
end
end
2 changes: 1 addition & 1 deletion src/tmle_inputs/tmle_inputs.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
const CHR_REG = r"chr[1-9]+"

param_batch_name(outprefix, batch_id) = string(outprefix, ".param_", batch_id, ".yaml")
"""
    param_batch_name(outprefix, batch_id)

Return the parameter-file name for batch `batch_id`, i.e.
`"<outprefix>.param_<batch_id>.yaml"`.
"""
param_batch_name(outprefix, batch_id) = "$(outprefix).param_$(batch_id).yaml"


"""
Expand Down
4 changes: 2 additions & 2 deletions test/confounders.jl
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ end

clean(parsed_args)

# No qc file provided
# No QC file provided
parsed_args = Dict(
"input" => SnpArrays.datadir("mouse"),
"output" => joinpath("data", "filtered-mouse"),
Expand Down Expand Up @@ -138,4 +138,4 @@ end

end;

true
true
File renamed without changes.
5 changes: 5 additions & 0 deletions test/data/bqtls_2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
ID,CHR,TF
RSID_17,12,TF1
RSID_99,12,TF1
RSID_17,12,TF2
RSID_198,12,TF2
3 changes: 3 additions & 0 deletions test/data/trans_actors_3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ID,CHR,TF
RSID_102,2,TF1
RSID_2,chr,TF2
91 changes: 84 additions & 7 deletions test/tmle_inputs/from_actors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ end
@test_throws ArgumentError TargeneCore.treatments_from_actors(1, nothing, nothing)
@test_throws ArgumentError TargeneCore.treatments_from_actors(nothing, 1, nothing)

bqtl_file = joinpath("data", "bqtls.csv")
bqtl_file = joinpath("data", "bqtls_1.csv")
trans_actors_prefix = joinpath("data", "trans_actors_1.csv")
env_file = joinpath("data", "extra_treatments.txt")
# bqtls and trans_actors
Expand Down Expand Up @@ -259,7 +259,7 @@ end
# - Order 1,2
parsed_args = Dict(
"from-actors" => Dict{String, Any}(
"bqtls" => joinpath("data", "bqtls.csv"),
"bqtls" => joinpath("data", "bqtls_1.csv"),
"trans-actors-prefix" => joinpath("data", "trans_actors_1.csv"),
"extra-covariates" => joinpath("data", "extra_covariates.txt"),
"extra-treatments" => joinpath("data", "extra_treatments.txt"),
Expand Down Expand Up @@ -333,7 +333,7 @@ end
# - batched
parsed_args = Dict(
"from-actors" => Dict{String, Any}(
"bqtls" => joinpath("data", "bqtls.csv"),
"bqtls" => joinpath("data", "bqtls_1.csv"),
"trans-actors-prefix" => joinpath("data", "trans_actors_2"),
"extra-covariates" => nothing,
"extra-treatments" => nothing,
Expand Down Expand Up @@ -361,10 +361,10 @@ end
@test size(traits) == (490, 13)

# Parameter files:
ouparameters_1 = parameters_from_yaml("final.param_1.yaml")
@test size(ouparameters_1, 1) == 100
ouparameters_2 = parameters_from_yaml("final.param_2.yaml")
outparameters = vcat(ouparameters_1, ouparameters_2)
outparameters_1 = parameters_from_yaml("final.param_1.yaml")
@test size(outparameters_1, 1) == 100
outparameters_2 = parameters_from_yaml("final.param_2.yaml")
outparameters = vcat(outparameters_1, outparameters_2)

found_targets = Dict(
:BINARY_1 => 0,
Expand Down Expand Up @@ -405,6 +405,83 @@ end
cleanup()
end

@testset "Test tmle_inputs from-actors: scenario 3" begin
    # Scenario:
    # - Trans-actors
    # - Extra Treatment
    # - Extra Covariates
    # - Order 1,2
    # - More than 1 TF present
    parsed_args = Dict(
        "from-actors" => Dict{String, Any}(
            "bqtls" => joinpath("data", "bqtls_2.csv"),
            "trans-actors-prefix" => joinpath("data", "trans_actors_3.csv"),
            "extra-covariates" => joinpath("data", "extra_covariates.txt"),
            "extra-treatments" => joinpath("data", "extra_treatments.txt"),
            "extra-confounders" => nothing,
            "orders" => "1,2",
        ),
        "traits" => joinpath("data", "traits_1.csv"),
        "pcs" => joinpath("data", "pcs.csv"),
        "call-threshold" => 0.8,
        "%COMMAND%" => "from-actors",
        "bgen-prefix" => joinpath("data", "ukbb", "imputed", "ukbb"),
        "out-prefix" => "final",
        "batch-size" => nothing,
        "positivity-constraint" => 0.
    )
    bqtls = Symbol.(unique(CSV.read(parsed_args["from-actors"]["bqtls"], DataFrame).ID))
    tmle_inputs(parsed_args)

    ## Dataset file
    trait_data = DataFrame(Arrow.Table("final.data.arrow"))
    @test names(trait_data) == [
        "SAMPLE_ID", "BINARY_1", "BINARY_2", "CONTINUOUS_1", "CONTINUOUS_2",
        "COV_1", "21003", "22001", "TREAT_1", "PC1", "PC2", "RSID_2", "RSID_102",
        "RSID_17", "RSID_198", "RSID_99"]
    @test size(trait_data) == (490, 16)

    ## Parameter files: one per TF
    outparameters = [parameters_from_yaml("final.TF1.param_1.yaml"), parameters_from_yaml("final.TF2.param_1.yaml")]
    found_targets = Dict(
        :BINARY_1 => 0,
        :CONTINUOUS_2 => 0,
        :CONTINUOUS_1 => 0,
        :BINARY_2 => 0
    )
    for tf in [1, 2]
        outparameters_tf = outparameters[tf]
        for Ψ in outparameters_tf
            if Ψ isa ATE
                ntreatments = length(Ψ.treatment)
                if ntreatments > 1
                    # Higher-order ATEs fix all non-bQTL treatments: case == control
                    @test all(Ψ.treatment[index].case == Ψ.treatment[index].control for index in 2:ntreatments)
                end
            else
                @test Ψ isa IATE
                @test all(cc.case != cc.control for cc in Ψ.treatment)
            end
            @test Ψ.covariates == [:COV_1, Symbol("21003"), Symbol("22001")]
            @test Ψ.confounders == [:PC1, :PC2]
            # The first treatment will be a bqtl
            @test keys(Ψ.treatment)[1] ∈ bqtls
            @test Ψ.treatment[1].case isa AbstractString
            @test length(Ψ.treatment) ∈ [1, 2]
            found_targets[Ψ.target] += 1
        end
    end
    # The number of parameters with various targets should be the same
    @test all(x == found_targets[:BINARY_1] for x in values(found_targets))
    # This is difficult to really check the ordering
    # Those correspond to the simple bQTL ATE
    for tf in [1, 2]
        first_treatments = keys(outparameters[tf][1].treatment)
        @test all(keys(Ψ.treatment) == first_treatments for Ψ in outparameters[tf][1:12])
    end

    cleanup()
end

end

true

0 comments on commit 33c3ff8

Please sign in to comment.