diff --git a/Manifest.toml b/Manifest.toml index 5e87db8..6e2864a 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.10.4" manifest_format = "2.0" -project_hash = "2b58dcc0ffd21f9ddd37c869a6c32c55af516134" +project_hash = "11c5af9ff209336cb724f866877ee918e2b37f35" [[deps.ARFFFiles]] deps = ["CategoricalArrays", "Dates", "Parsers", "Tables"] @@ -938,9 +938,9 @@ version = "1.0.2" [[deps.HTTP]] deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"] -git-tree-sha1 = "d1d712be3164d61d1fb98e7ce9bcbc6cc06b45ed" +git-tree-sha1 = "bc3f416a965ae61968c20d0ad867556367f2817d" uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" -version = "1.10.8" +version = "1.10.9" [[deps.HarfBuzz_jll]] deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll"] @@ -1155,9 +1155,9 @@ version = "0.21.4" [[deps.JSON3]] deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"] -git-tree-sha1 = "eb3edce0ed4fa32f75a0a11217433c31d56bd48b" +git-tree-sha1 = "1d322381ef7b087548321d3f878cb4c9bd8f8f9b" uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" -version = "1.14.0" +version = "1.14.1" weakdeps = ["ArrowTypes"] [deps.JSON3.extensions] @@ -1589,6 +1589,12 @@ version = "0.3.4" uuid = "14a3606d-f60d-562e-9121-12d972cd8159" version = "2023.1.10" +[[deps.MultipleTesting]] +deps = ["Distributions", "SpecialFunctions", "StatsBase"] +git-tree-sha1 = "1e98f8f732e7035c4333135b75605b74f3462b9b" +uuid = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b" +version = "0.6.0" + [[deps.NLSolversBase]] deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"] git-tree-sha1 = "a0b464d183da839699f4c79e7606d9d186ec172c" diff --git a/Project.toml b/Project.toml index f823dd3..e2f89a6 100644 --- a/Project.toml +++ b/Project.toml @@ -13,10 +13,13 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2" Mmap = "a63ad114-7e13-5084-954f-fe012c677804" +MultipleTesting = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" @@ -24,6 +27,7 @@ Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" SnpArrays = "4e780e97-f5bf-4111-9dc4-b70aaf691b06" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TMLE = "8afdd2fb-6e73-43df-8b62-b1650cd9c8cf" TMLECLI = "2573d147-4098-46ba-9db2-8608d210ccac" YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" @@ -37,12 +41,12 @@ CairoMakie = "0.12" CategoricalArrays = "0.10" Combinatorics = "1.0" DataFrames = "1.2" +MKL = "0.7" OrderedCollections = "1.6.3" PackageCompiler = "2.1.17" SnpArrays = "0.3" StableRNGs = "1.0.1" Statistics = "1.10" TMLE = "0.17" -MKL = "0.7" YAML = "0.4.9" julia = "1.10, 1" diff --git a/src/inputs_from_config.jl b/src/inputs_from_config.jl index f5bc918..886709b 100644 --- a/src/inputs_from_config.jl +++ b/src/inputs_from_config.jl @@ -167,7 +167,7 @@ end """ function treatments_from_variant(variant::String, dataset::DataFrame) variant_levels = sort(levels(dataset[!, variant], skipmissing=true)) - return Dict{Symbol, Vector{UInt8}}(Symbol(variant)=>variant_levels) + return Dict{Symbol, Vector{String}}(Symbol(variant)=>variant_levels) end function estimands_from_gwas(dataset, variants, outcomes, confounders; @@ -198,14 +198,35 @@ function read_bed_chromosome(bedprefix) return SnpData(bed_file, famnm=fam_file, bimnm=bim_file) end +function map_allele(value, allele1, allele2) + if value == 0x00 + return "$allele1$allele1" + elseif value == 0x01 + return missing + elseif value == 0x02 + return "$allele1$allele2" + elseif value == 0x03 + return "$allele2$allele2" + end +end + +function convert_string(snpdata) + genotypes_data = [] + for col in 1:snpdata.snps + allele_col = snpdata.snparray[:,col] + allele1 = snpdata.snp_info[col, "allele1"] + allele2 = snpdata.snp_info[col, "allele2"] + mapped_col = map(value -> map_allele(value, allele1, allele2), allele_col) + push!(genotypes_data, mapped_col) + end + return DataFrame(genotypes_data, snpdata.snp_info."snpid") +end + function get_genotypes_from_beds(bedprefix) snpdata = read_bed_chromosome(bedprefix) - genotypes = DataFrame(convert(Matrix{UInt8}, snpdata.snparray), snpdata.snp_info."snpid") - genotype_map = Union{UInt8, Missing}[0, missing, 1, 2] - for col in names(genotypes) - genotypes[!, col] = [genotype_map[x+1] for x in genotypes[!, col]] - end + genotypes = convert_string(snpdata) insertcols!(genotypes, 1, :SAMPLE_ID => snpdata.person_info."iid") + return genotypes end diff --git a/test/inputs_from_gwas_config.jl b/test/inputs_from_gwas_config.jl index 919daff..fc80ff5 100644 --- a/test/inputs_from_gwas_config.jl +++ b/test/inputs_from_gwas_config.jl @@ -19,18 +19,27 @@ function get_summary_stats(estimands) return sort(combine(groupby(results, :OUTCOME), nrow), :OUTCOME) end -function check_estimands_levels_order(estimands) +function check_estimands_levels_order(estimands, snp_info) for Ψ in estimands # If the two components are present, the first is the 0 -> 1 and the second is the 1 -> 2 variant = only(keys(Ψ.args[1].treatment_values)) + variant_info = filter(:snpid=>x->x==String(variant),snp_info) + allele1, allele2 = variant_info.allele1[1], variant_info.allele2[1] + + # Here, we check if the order is sufficient to be able to compute non-linear effects any of these combinations will do if length(Ψ.args) == 2 - @test Ψ.args[1].treatment_values[variant] == (control = 0x00, case = 0x01) - @test Ψ.args[2].treatment_values[variant] == (control = 0x01, case = 0x02) + @test (Ψ.args[1].treatment_values[variant] == (control = allele1*allele1, case = allele1*allele2) && + Ψ.args[2].treatment_values[variant] == (control = allele1*allele2, case = allele2*allele2)) || + (Ψ.args[1].treatment_values[variant] == (control = allele2*allele2, case = allele1*allele2) && + Ψ.args[2].treatment_values[variant] == (control = allele1*allele2, case = allele1*allele1)) else # Otherwise we check they are one or the other arg = only(Ψ.args) - @test arg.treatment_values[variant]==(control = 0x00, case = 0x01) || - arg.treatment_values[variant]==( control = 0x01, case = 0x02) + @test arg.treatment_values[variant] == (control = allele1*allele1, case = allele1*allele2) || + arg.treatment_values[variant] == (control = allele2*allele2, case = allele1*allele2) || + arg.treatment_values[variant] == (control = allele1*allele2, case = allele2*allele2) || + arg.treatment_values[variant] == (control = allele1*allele2, case = allele1*allele1) + end end end @@ -48,6 +57,9 @@ end "--positivity-constraint=0" ]) TargeneCore.julia_main() + # Define SNP information to check string allele defintions + snpdata = read_bed_chromosome(joinpath(TESTDIR, "data", "ukbb", "genotypes" , "ukbb_1.")) + snp_info = select(DataFrame(snpdata.snp_info), [:snpid, :allele1, :allele2]) # Check dataset dataset = DataFrame(Arrow.Table(joinpath(tmpdir, "final.data.arrow"))) @test size(dataset) == (1940, 886) @@ -68,7 +80,7 @@ end nrow = repeat([875], 5) ) - check_estimands_levels_order(estimands) + check_estimands_levels_order(estimands, snp_info) end @testset "Test inputs_from_config gwas: positivity constraint" begin @@ -85,6 +97,9 @@ end "--positivity-constraint=0.2" ]) TargeneCore.julia_main() + # Define SNP information to check string allele defintions + snpdata = read_bed_chromosome(joinpath(TESTDIR, "data", "ukbb", "genotypes" , "ukbb_1.")) + snp_info = select(DataFrame(snpdata.snp_info), [:snpid, :allele1, :allele2]) # Check dataset dataset = DataFrame(Arrow.Table(joinpath(tmpdir, "final.data.arrow"))) @test size(dataset) == (1940, 886) @@ -103,7 +118,7 @@ end nrow = repeat([777], 5) ) - check_estimands_levels_order(estimands) + check_estimands_levels_order(estimands, snp_info) end end diff --git a/test/testutils.jl b/test/testutils.jl index 87c2fb8..6bfdb0f 100644 --- a/test/testutils.jl +++ b/test/testutils.jl @@ -202,4 +202,30 @@ function make_estimands_configuration_file(config_generator=make_estimands_confi filename = joinpath(dir, "configuration.yaml") TMLE.write_yaml(filename, config) return filename -end \ No newline at end of file +end + +# Additional helper functions for testing GWAS treatment strings + +get_only_file_with_suffix(files, suffix) = files[only(findall(x -> endswith(x, suffix), files))] + +function files_matching_prefix(prefix) + directory, _prefix = splitdir(prefix) + _directory = directory == "" ? "." : directory + + return map( + f -> joinpath(directory, f), + filter( + f -> startswith(f, _prefix), + readdir(_directory) + ) + ) +end + +function read_bed_chromosome(bedprefix) + bed_files = files_matching_prefix(bedprefix) + fam_file = get_only_file_with_suffix(bed_files, "fam") + bim_file = get_only_file_with_suffix(bed_files, "bim") + bed_file = get_only_file_with_suffix(bed_files, "bed")[1:end-4] + return SnpData(bed_file, famnm=fam_file, bimnm=bim_file) +end +