TARGENE · joshua-slaughter · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024 · Nov 9, 2024
diff --git a/Manifest.toml b/Manifest.toml
@@ -2,7 +2,7 @@
 
 julia_version = "1.10.4"
 manifest_format = "2.0"
-project_hash = "2b58dcc0ffd21f9ddd37c869a6c32c55af516134"
+project_hash = "11c5af9ff209336cb724f866877ee918e2b37f35"
 
 [[deps.ARFFFiles]]
 deps = ["CategoricalArrays", "Dates", "Parsers", "Tables"]
@@ -938,9 +938,9 @@ version = "1.0.2"
 
 [[deps.HTTP]]
 deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"]
-git-tree-sha1 = "d1d712be3164d61d1fb98e7ce9bcbc6cc06b45ed"
+git-tree-sha1 = "bc3f416a965ae61968c20d0ad867556367f2817d"
 uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
-version = "1.10.8"
+version = "1.10.9"
 
 [[deps.HarfBuzz_jll]]
 deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll"]
@@ -1155,9 +1155,9 @@ version = "0.21.4"
 
 [[deps.JSON3]]
 deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"]
-git-tree-sha1 = "eb3edce0ed4fa32f75a0a11217433c31d56bd48b"
+git-tree-sha1 = "1d322381ef7b087548321d3f878cb4c9bd8f8f9b"
 uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
-version = "1.14.0"
+version = "1.14.1"
 weakdeps = ["ArrowTypes"]
 
     [deps.JSON3.extensions]
@@ -1589,6 +1589,12 @@ version = "0.3.4"
 uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
 version = "2023.1.10"
 
+[[deps.MultipleTesting]]
+deps = ["Distributions", "SpecialFunctions", "StatsBase"]
+git-tree-sha1 = "1e98f8f732e7035c4333135b75605b74f3462b9b"
+uuid = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b"
+version = "0.6.0"
+
 [[deps.NLSolversBase]]
 deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"]
 git-tree-sha1 = "a0b464d183da839699f4c79e7606d9d186ec172c"

diff --git a/Project.toml b/Project.toml
@@ -13,17 +13,21 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
 JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
+JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
 Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
+MultipleTesting = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b"
 OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 SnpArrays = "4e780e97-f5bf-4111-9dc4-b70aaf691b06"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TMLE = "8afdd2fb-6e73-43df-8b62-b1650cd9c8cf"
 TMLECLI = "2573d147-4098-46ba-9db2-8608d210ccac"
 YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
@@ -37,12 +41,12 @@ CairoMakie = "0.12"
 CategoricalArrays = "0.10"
 Combinatorics = "1.0"
 DataFrames = "1.2"
+MKL = "0.7"
 OrderedCollections = "1.6.3"
 PackageCompiler = "2.1.17"
 SnpArrays = "0.3"
 StableRNGs = "1.0.1"
 Statistics = "1.10"
 TMLE = "0.17"
-MKL = "0.7"
 YAML = "0.4.9"
 julia = "1.10, 1"
diff --git a/src/inputs_from_config.jl b/src/inputs_from_config.jl
@@ -167,7 +167,7 @@ end
 """
 function treatments_from_variant(variant::String, dataset::DataFrame)
     variant_levels = sort(levels(dataset[!, variant], skipmissing=true))
-    return Dict{Symbol, Vector{UInt8}}(Symbol(variant)=>variant_levels)
+    return Dict{Symbol, Vector{String}}(Symbol(variant)=>variant_levels)
 end
 
 function estimands_from_gwas(dataset, variants, outcomes, confounders; 
@@ -198,14 +198,35 @@ function read_bed_chromosome(bedprefix)
     return SnpData(bed_file, famnm=fam_file, bimnm=bim_file)
 end
 
+function map_allele(value, allele1, allele2)
+    if value == 0x00
+        return "$allele1$allele1"
+    elseif value == 0x01
+        return missing
+    elseif value == 0x02
+        return "$allele1$allele2"
+    elseif value == 0x03
+        return "$allele2$allele2"
+    end
+end
+
+function convert_string(snpdata)
+    genotypes_data = []
+    for col in 1:snpdata.snps
+        allele_col = snpdata.snparray[:,col]
+        allele1 = snpdata.snp_info[col, "allele1"]
+        allele2 = snpdata.snp_info[col, "allele2"]
+        mapped_col = map(value -> map_allele(value, allele1, allele2), allele_col)
+        push!(genotypes_data, mapped_col)
+    end
+    return DataFrame(genotypes_data, snpdata.snp_info."snpid")
+end
+
 function get_genotypes_from_beds(bedprefix)
     snpdata = read_bed_chromosome(bedprefix)
-    genotypes = DataFrame(convert(Matrix{UInt8}, snpdata.snparray), snpdata.snp_info."snpid")
-    genotype_map = Union{UInt8, Missing}[0, missing, 1, 2]
-    for col in names(genotypes)
-        genotypes[!, col] = [genotype_map[x+1] for x in genotypes[!, col]]
-    end
+    genotypes = convert_string(snpdata)
     insertcols!(genotypes, 1, :SAMPLE_ID => snpdata.person_info."iid")
+
     return genotypes
 end
 

diff --git a/test/inputs_from_gwas_config.jl b/test/inputs_from_gwas_config.jl
@@ -19,18 +19,27 @@ function get_summary_stats(estimands)
     return sort(combine(groupby(results, :OUTCOME), nrow), :OUTCOME)
 end
 
-function check_estimands_levels_order(estimands)
+function check_estimands_levels_order(estimands, snp_info)
     for Ψ in estimands
         # If the two components are present, the first is the 0 -> 1 and the second is the 1 -> 2
         variant = only(keys(Ψ.args[1].treatment_values))
+        variant_info = filter(:snpid=>x->x==String(variant),snp_info)
+        allele1, allele2 = variant_info.allele1[1], variant_info.allele2[1]
+
+        # Here, we check if the order is sufficient to be able to compute non-linear effects any of these combinations will do
         if length(Ψ.args) == 2
-            @test Ψ.args[1].treatment_values[variant] == (control = 0x00, case = 0x01)
-            @test Ψ.args[2].treatment_values[variant] == (control = 0x01, case = 0x02)
+            @test (Ψ.args[1].treatment_values[variant] == (control = allele1*allele1, case = allele1*allele2) && 
+            Ψ.args[2].treatment_values[variant] == (control = allele1*allele2, case = allele2*allele2)) || 
+            (Ψ.args[1].treatment_values[variant] == (control = allele2*allele2, case = allele1*allele2) && 
+            Ψ.args[2].treatment_values[variant] == (control = allele1*allele2, case = allele1*allele1))
         else
             # Otherwise we check they are one or the other
             arg = only(Ψ.args)
-            @test arg.treatment_values[variant]==(control = 0x00, case = 0x01) ||
-            arg.treatment_values[variant]==( control = 0x01, case = 0x02)
+            @test arg.treatment_values[variant] == (control = allele1*allele1, case = allele1*allele2)  ||
+            arg.treatment_values[variant] == (control = allele2*allele2, case = allele1*allele2) ||
+            arg.treatment_values[variant] == (control = allele1*allele2, case = allele2*allele2) ||
+            arg.treatment_values[variant] == (control = allele1*allele2, case = allele1*allele1) 
+
         end
    end
 end
@@ -48,6 +57,9 @@ end
         "--positivity-constraint=0"
     ])
     TargeneCore.julia_main()
+    # Define SNP information to check string allele defintions
+    snpdata = read_bed_chromosome(joinpath(TESTDIR, "data", "ukbb", "genotypes" , "ukbb_1."))
+    snp_info = select(DataFrame(snpdata.snp_info), [:snpid, :allele1, :allele2])
     # Check dataset
     dataset = DataFrame(Arrow.Table(joinpath(tmpdir, "final.data.arrow")))
     @test size(dataset) == (1940, 886)
@@ -68,7 +80,7 @@ end
         nrow = repeat([875], 5)
     )
 
-    check_estimands_levels_order(estimands)
+    check_estimands_levels_order(estimands, snp_info)
 end
 
 @testset "Test inputs_from_config gwas: positivity constraint" begin
@@ -85,6 +97,9 @@ end
         "--positivity-constraint=0.2"
     ])
     TargeneCore.julia_main()
+    # Define SNP information to check string allele defintions
+    snpdata = read_bed_chromosome(joinpath(TESTDIR, "data", "ukbb", "genotypes" , "ukbb_1."))
+    snp_info = select(DataFrame(snpdata.snp_info), [:snpid, :allele1, :allele2])
     # Check dataset
     dataset = DataFrame(Arrow.Table(joinpath(tmpdir, "final.data.arrow")))
     @test size(dataset) == (1940, 886)
@@ -103,7 +118,7 @@ end
         nrow = repeat([777], 5)
     )
 
-    check_estimands_levels_order(estimands)
+    check_estimands_levels_order(estimands, snp_info)
 end
 
 end

diff --git a/test/testutils.jl b/test/testutils.jl
@@ -202,4 +202,30 @@ function make_estimands_configuration_file(config_generator=make_estimands_confi
     filename = joinpath(dir, "configuration.yaml")
     TMLE.write_yaml(filename, config)
     return filename
-end
+end
+
+# Additional helper functions for testing GWAS treatment strings
+
+get_only_file_with_suffix(files, suffix) = files[only(findall(x -> endswith(x, suffix), files))]
+
+function files_matching_prefix(prefix)
+    directory, _prefix = splitdir(prefix)
+    _directory = directory == "" ? "." : directory
+
+    return map(
+        f -> joinpath(directory, f),
+        filter(
+            f -> startswith(f, _prefix), 
+            readdir(_directory)
+        )
+    )
+end
+
+function read_bed_chromosome(bedprefix)
+    bed_files = files_matching_prefix(bedprefix)
+    fam_file = get_only_file_with_suffix(bed_files, "fam")
+    bim_file = get_only_file_with_suffix(bed_files, "bim")
+    bed_file = get_only_file_with_suffix(bed_files, "bed")[1:end-4]
+    return SnpData(bed_file, famnm=fam_file, bimnm=bim_file)
+end
+