diff --git a/src/confounders.jl b/src/confounders.jl index 4f6175e..73ecf39 100644 --- a/src/confounders.jl +++ b/src/confounders.jl @@ -106,4 +106,4 @@ function adapt_flashpca(parsed_args) pcs = CSV.File(parsed_args["input"], drop=["FID"]) |> DataFrame rename!(pcs, :IID => :SAMPLE_ID) CSV.write(parsed_args["output"], pcs) -end \ No newline at end of file +end diff --git a/src/tmle_inputs/from_actors.jl b/src/tmle_inputs/from_actors.jl index 894b5e9..7dfe594 100644 --- a/src/tmle_inputs/from_actors.jl +++ b/src/tmle_inputs/from_actors.jl @@ -11,12 +11,15 @@ function treatments_from_actors(bqtl_file, env_file, trans_actors_prefix) bqtls, transactors, extraT end +filter_snps_by_tf(df::DataFrame, tf_name::AbstractString) = filter(row -> row.TF == tf_name, df) +filter_snps_by_tf(df::DataFrame, tf_name::Nothing) = df +filter_snps_by_tf(df_vector, tf_name::AbstractString) = [filter_snps_by_tf(df, tf_name) for df in df_vector] +filter_snps_by_tf(df_vector, tf_name::Nothing) = df_vector combine_trans_actors(trans_actors::Vector{DataFrame}, extraT::DataFrame, order) = combinations([trans_actors..., extraT], order) combine_trans_actors(trans_actors::Vector{DataFrame}, extraT::Nothing, order) = combinations(trans_actors, order) combine_trans_actors(trans_actors::Nothing, extraT::DataFrame, order) = [[extraT]] - function combine_by_bqtl(bqtls::DataFrame, trans_actors::Union{Vector{DataFrame}, Nothing}, extraT::Union{DataFrame, Nothing}, order::Int) treatment_combinations = Vector{Symbol}[] if order == 1 @@ -44,8 +47,11 @@ all_variants(bqtls::DataFrame, transactors::Vector{DataFrame}) = Set(vcat(bqtls. read_snps_from_csv(path::Nothing) = nothing -read_snps_from_csv(path::String) = unique(CSV.read(path, DataFrame; select=[:ID, :CHR]), :ID) - +function read_snps_from_csv(path::String) + df = CSV.read(path, DataFrame) + df = "TF" in names(df) ? 
unique(df[:, [:ID, :CHR, :TF]], [:ID, :TF]) : unique(df[:, [:ID, :CHR]], :ID) + return(df) +end trans_actors_from_prefix(trans_actors_prefix::Nothing) = nothing function trans_actors_from_prefix(trans_actors_prefix::String) @@ -156,7 +162,14 @@ function parameters_from_actors(bqtls, transactors, data, variables, orders, out for order in orders # First generate the `T` section treatment_combinations = TargeneCore.combine_by_bqtl(bqtls, transactors, extraT_df, order) + # If there are duplicates here, remove them + treatment_combinations = unique(treatment_combinations) for treatments in treatment_combinations + # If RSID is duplicated in treatments, skip + if length(treatments) != length(unique(treatments)) + continue + end + addParameters!(parameters, treatments, variables, data; positivity_constraint=positivity_constraint) if batch_size !== nothing && size(parameters, 1) >= batch_size @@ -208,11 +221,19 @@ function tmle_inputs_from_actors(parsed_args) # Parameter files variables = TargeneCore.get_variables(pcs, traits, extraW, extraC, extraT) - TargeneCore.parameters_from_actors( - bqtls, transactors, data, variables, orders, outprefix; + + # Loop through each TF present in bqtls file + tfs = "TF" in names(bqtls) ? unique(bqtls.TF) : [nothing] + for tf in tfs + outprefix_tf = tf !== nothing ? 
string(outprefix,".",tf) : outprefix + bqtls_tf = TargeneCore.filter_snps_by_tf(bqtls, tf) + transactors_tf = TargeneCore.filter_snps_by_tf(transactors, tf) + TargeneCore.parameters_from_actors( + bqtls_tf, transactors_tf, data, variables, orders, outprefix_tf; positivity_constraint=positivity_constraint, batch_size=batch_size - ) + ) + end # write data Arrow.write(string(outprefix, ".data.arrow"), data) -end \ No newline at end of file +end diff --git a/src/tmle_inputs/tmle_inputs.jl b/src/tmle_inputs/tmle_inputs.jl index b6c0ff7..85f1d8b 100644 --- a/src/tmle_inputs/tmle_inputs.jl +++ b/src/tmle_inputs/tmle_inputs.jl @@ -1,6 +1,6 @@ const CHR_REG = r"chr[1-9]+" -param_batch_name(outprefix, batch_id) = string(outprefix, ".param_", batch_id, ".yaml") +param_batch_name(outprefix, batch_id) = string(outprefix, ".param_", batch_id, ".yaml") """ diff --git a/test/confounders.jl b/test/confounders.jl index ff1970d..4acf0c5 100644 --- a/test/confounders.jl +++ b/test/confounders.jl @@ -56,7 +56,7 @@ end clean(parsed_args) - # No qc file provided + # No QC file provided parsed_args = Dict( "input" => SnpArrays.datadir("mouse"), "output" => joinpath("data", "filtered-mouse"), @@ -138,4 +138,4 @@ end end; -true \ No newline at end of file +true diff --git a/test/data/bqtls.csv b/test/data/bqtls_1.csv similarity index 100% rename from test/data/bqtls.csv rename to test/data/bqtls_1.csv diff --git a/test/data/bqtls_2.csv b/test/data/bqtls_2.csv new file mode 100644 index 0000000..5376ba4 --- /dev/null +++ b/test/data/bqtls_2.csv @@ -0,0 +1,5 @@ +ID,CHR,TF +RSID_17,12,TF1 +RSID_99,12,TF1 +RSID_17,12,TF2 +RSID_198,12,TF2 diff --git a/test/data/trans_actors_3.csv b/test/data/trans_actors_3.csv new file mode 100644 index 0000000..743027b --- /dev/null +++ b/test/data/trans_actors_3.csv @@ -0,0 +1,3 @@ +ID,CHR,TF +RSID_102,2,TF1 +RSID_2,chr,TF2 diff --git a/test/tmle_inputs/from_actors.jl b/test/tmle_inputs/from_actors.jl index 0ea7904..f37b922 100644 --- 
a/test/tmle_inputs/from_actors.jl +++ b/test/tmle_inputs/from_actors.jl @@ -222,7 +222,7 @@ end @test_throws ArgumentError TargeneCore.treatments_from_actors(1, nothing, nothing) @test_throws ArgumentError TargeneCore.treatments_from_actors(nothing, 1, nothing) - bqtl_file = joinpath("data", "bqtls.csv") + bqtl_file = joinpath("data", "bqtls_1.csv") trans_actors_prefix = joinpath("data", "trans_actors_1.csv") env_file = joinpath("data", "extra_treatments.txt") # bqtls and trans_actors @@ -259,7 +259,7 @@ end # - Order 1,2 parsed_args = Dict( "from-actors" => Dict{String, Any}( - "bqtls" => joinpath("data", "bqtls.csv"), + "bqtls" => joinpath("data", "bqtls_1.csv"), "trans-actors-prefix" => joinpath("data", "trans_actors_1.csv"), "extra-covariates" => joinpath("data", "extra_covariates.txt"), "extra-treatments" => joinpath("data", "extra_treatments.txt"), @@ -333,7 +333,7 @@ end # - batched parsed_args = Dict( "from-actors" => Dict{String, Any}( - "bqtls" => joinpath("data", "bqtls.csv"), + "bqtls" => joinpath("data", "bqtls_1.csv"), "trans-actors-prefix" => joinpath("data", "trans_actors_2"), "extra-covariates" => nothing, "extra-treatments" => nothing, @@ -361,10 +361,10 @@ end @test size(traits) == (490, 13) # Parameter files: - ouparameters_1 = parameters_from_yaml("final.param_1.yaml") - @test size(ouparameters_1, 1) == 100 - ouparameters_2 = parameters_from_yaml("final.param_2.yaml") - outparameters = vcat(ouparameters_1, ouparameters_2) + outparameters_1 = parameters_from_yaml("final.param_1.yaml") + @test size(outparameters_1, 1) == 100 + outparameters_2 = parameters_from_yaml("final.param_2.yaml") + outparameters = vcat(outparameters_1, outparameters_2) found_targets = Dict( :BINARY_1 => 0, @@ -405,6 +405,83 @@ end cleanup() end +@testset "Test tmle_inputs from-actors: scenario 3" begin + # Scenario: + # - Trans-actors + # - Extra Treatment + # - Extra Covariates + # - Order 1,2 + # - More than 1 TF present + parsed_args = Dict( + "from-actors" => 
Dict{String, Any}( + "bqtls" => joinpath("data", "bqtls_2.csv"), + "trans-actors-prefix" => joinpath("data", "trans_actors_3.csv"), + "extra-covariates" => joinpath("data", "extra_covariates.txt"), + "extra-treatments" => joinpath("data", "extra_treatments.txt"), + "extra-confounders" => nothing, + "orders" => "1,2", + ), + "traits" => joinpath("data", "traits_1.csv"), + "pcs" => joinpath("data", "pcs.csv"), + "call-threshold" => 0.8, + "%COMMAND%" => "from-actors", + "bgen-prefix" => joinpath("data", "ukbb", "imputed" ,"ukbb"), + "out-prefix" => "final", + "batch-size" => nothing, + "positivity-constraint" => 0. + ) + bqtls = Symbol.(unique(CSV.read(parsed_args["from-actors"]["bqtls"], DataFrame).ID)) + tmle_inputs(parsed_args) + + ## Dataset file + trait_data = DataFrame(Arrow.Table("final.data.arrow")) + @test names(trait_data) == [ + "SAMPLE_ID", "BINARY_1", "BINARY_2", "CONTINUOUS_1", "CONTINUOUS_2", + "COV_1", "21003", "22001", "TREAT_1", "PC1", "PC2", "RSID_2", "RSID_102", + "RSID_17", "RSID_198", "RSID_99"] + @test size(trait_data) == (490, 16) + + ## Parameter file: + outparameters = [parameters_from_yaml("final.TF1.param_1.yaml"), parameters_from_yaml("final.TF2.param_1.yaml")] + found_targets = Dict( + :BINARY_1 => 0, + :CONTINUOUS_2 => 0, + :CONTINUOUS_1 => 0, + :BINARY_2 => 0 + ) + for tf in [1,2] + outparameters_tf = outparameters[tf] + for Ψ in outparameters_tf + if Ψ isa ATE + ntreatments = length(Ψ.treatment) + if ntreatments > 1 + @test all(Ψ.treatment[index].case == Ψ.treatment[index].control for index ∈ 2:ntreatments) + end + else + @test Ψ isa IATE + @test all(cc.case != cc.control for cc ∈ Ψ.treatment) + end + @test Ψ.covariates == [:COV_1, Symbol("21003"), Symbol("22001")] + @test Ψ.confounders == [:PC1, :PC2] + # The first treatment will be a bqtl + @test keys(Ψ.treatment)[1] ∈ bqtls + @test Ψ.treatment[1].case isa AbstractString + @test length(Ψ.treatment) ∈ [1, 2] + found_targets[Ψ.target] += 1 + end + end + # The number of parameters with 
various targets should be the same + @test all(x == found_targets[:BINARY_1] for x in values(found_targets)) + # It is difficult to fully verify the ordering here + # Those correspond to the simple bQTL ATE + for tf in [1,2] + first_treatments = keys(outparameters[tf][1].treatment) + @test all(keys(Ψ.treatment) == first_treatments for Ψ in outparameters[tf][1:12]) + end + + cleanup() +end + end true \ No newline at end of file