Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

String treatments (alleles) implemented for GWAS #197

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

julia_version = "1.10.4"
manifest_format = "2.0"
project_hash = "2b58dcc0ffd21f9ddd37c869a6c32c55af516134"
project_hash = "11c5af9ff209336cb724f866877ee918e2b37f35"

[[deps.ARFFFiles]]
deps = ["CategoricalArrays", "Dates", "Parsers", "Tables"]
Expand Down Expand Up @@ -938,9 +938,9 @@ version = "1.0.2"

[[deps.HTTP]]
deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"]
git-tree-sha1 = "d1d712be3164d61d1fb98e7ce9bcbc6cc06b45ed"
git-tree-sha1 = "bc3f416a965ae61968c20d0ad867556367f2817d"
uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3"
version = "1.10.8"
version = "1.10.9"

[[deps.HarfBuzz_jll]]
deps = ["Artifacts", "Cairo_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "Graphite2_jll", "JLLWrappers", "Libdl", "Libffi_jll"]
Expand Down Expand Up @@ -1155,9 +1155,9 @@ version = "0.21.4"

[[deps.JSON3]]
deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"]
git-tree-sha1 = "eb3edce0ed4fa32f75a0a11217433c31d56bd48b"
git-tree-sha1 = "1d322381ef7b087548321d3f878cb4c9bd8f8f9b"
uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
version = "1.14.0"
version = "1.14.1"
weakdeps = ["ArrowTypes"]

[deps.JSON3.extensions]
Expand Down Expand Up @@ -1589,6 +1589,12 @@ version = "0.3.4"
uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
version = "2023.1.10"

[[deps.MultipleTesting]]
deps = ["Distributions", "SpecialFunctions", "StatsBase"]
git-tree-sha1 = "1e98f8f732e7035c4333135b75605b74f3462b9b"
uuid = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b"
version = "0.6.0"

[[deps.NLSolversBase]]
deps = ["DiffResults", "Distributed", "FiniteDiff", "ForwardDiff"]
git-tree-sha1 = "a0b464d183da839699f4c79e7606d9d186ec172c"
Expand Down
6 changes: 5 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,21 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
MultipleTesting = "f8716d33-7c4a-5097-896f-ce0ecbd3ef6b"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
SnpArrays = "4e780e97-f5bf-4111-9dc4-b70aaf691b06"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
TMLE = "8afdd2fb-6e73-43df-8b62-b1650cd9c8cf"
TMLECLI = "2573d147-4098-46ba-9db2-8608d210ccac"
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
Expand All @@ -37,12 +41,12 @@ CairoMakie = "0.12"
CategoricalArrays = "0.10"
Combinatorics = "1.0"
DataFrames = "1.2"
MKL = "0.7"
OrderedCollections = "1.6.3"
PackageCompiler = "2.1.17"
SnpArrays = "0.3"
StableRNGs = "1.0.1"
Statistics = "1.10"
TMLE = "0.17"
MKL = "0.7"
YAML = "0.4.9"
julia = "1.10, 1"
33 changes: 27 additions & 6 deletions src/inputs_from_config.jl
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ end
"""
function treatments_from_variant(variant::String, dataset::DataFrame)
# Sorted levels of the variant column, with `missing` excluded.
variant_levels = sort(levels(dataset[!, variant], skipmissing=true))
# NOTE(review): two `return` statements follow back to back, so only the first one
# (the `Vector{UInt8}` dictionary) can ever run and the `Vector{String}` one is
# unreachable. This looks like the removed and added lines of a diff kept side by
# side; confirm which element type is intended and delete the other line.
return Dict{Symbol, Vector{UInt8}}(Symbol(variant)=>variant_levels)
return Dict{Symbol, Vector{String}}(Symbol(variant)=>variant_levels)
end

function estimands_from_gwas(dataset, variants, outcomes, confounders;
Expand Down Expand Up @@ -198,14 +198,35 @@ function read_bed_chromosome(bedprefix)
return SnpData(bed_file, famnm=fam_file, bimnm=bim_file)
end

"""
    map_allele(value, allele1, allele2)

Translate a single genotype code into a genotype string built from two alleles.

`value` is compared against the four codes below:

- `0x00`: `allele1` written twice
- `0x01`: `missing`
- `0x02`: `allele1` followed by `allele2`
- `0x03`: `allele2` written twice

# Throws
- `ArgumentError` if `value` is none of the four codes. Previously such a value fell
  through the conditional chain and silently produced `nothing`.
"""
function map_allele(value, allele1, allele2)
    value == 0x00 && return "$allele1$allele1"
    value == 0x01 && return missing
    value == 0x02 && return "$allele1$allele2"
    value == 0x03 && return "$allele2$allele2"
    # Fail loudly rather than letting an implicit `nothing` leak into genotype columns.
    throw(ArgumentError(
        "unexpected genotype code $(repr(value)); expected 0x00, 0x01, 0x02 or 0x03"
    ))
end

"""
    convert_string(snpdata)

Build a `DataFrame` of string genotypes from `snpdata`.

For every column index in `1:snpdata.snps`, the genotype codes in that column of
`snpdata.snparray` are passed through `map_allele` together with the `allele1` and
`allele2` entries of the same row of `snpdata.snp_info`. The resulting columns are
named after `snpdata.snp_info."snpid"`.
"""
function convert_string(snpdata)
    snp_info = snpdata.snp_info
    genotype_columns = map(1:snpdata.snps) do col
        allele1 = snp_info[col, "allele1"]
        allele2 = snp_info[col, "allele2"]
        # A view avoids copying the column; the comprehension still yields a plain
        # `Vector` whose element type is derived from the mapped values.
        codes = @view snpdata.snparray[:, col]
        [map_allele(code, allele1, allele2) for code in codes]
    end
    return DataFrame(genotype_columns, snp_info."snpid")
end

"""
    get_genotypes_from_beds(bedprefix)

Load the genotypes found under `bedprefix` and return them as a `DataFrame`.

The data is read with `read_bed_chromosome`, converted to string genotypes with
`convert_string`, and a `SAMPLE_ID` column taken from `snpdata.person_info."iid"` is
inserted as the first column.
"""
function get_genotypes_from_beds(bedprefix)
    snpdata = read_bed_chromosome(bedprefix)
    # The earlier numeric table (full `Matrix{UInt8}` conversion plus per-column remap)
    # was overwritten right after being built, so it is no longer computed.
    genotypes = convert_string(snpdata)
    insertcols!(genotypes, 1, :SAMPLE_ID => snpdata.person_info."iid")
    return genotypes
end

Expand Down
29 changes: 22 additions & 7 deletions test/inputs_from_gwas_config.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,27 @@ function get_summary_stats(estimands)
return sort(combine(groupby(results, :OUTCOME), nrow), :OUTCOME)
end

function check_estimands_levels_order(estimands)
function check_estimands_levels_order(estimands, snp_info)
for Ψ in estimands
# If the two components are present, the first is the 0 -> 1 and the second is the 1 -> 2
variant = only(keys(Ψ.args[1].treatment_values))
variant_info = filter(:snpid=>x->x==String(variant),snp_info)
allele1, allele2 = variant_info.allele1[1], variant_info.allele2[1]

# Here, we check that the order is sufficient to compute non-linear effects; any of these combinations will do.
if length(Ψ.args) == 2
@test Ψ.args[1].treatment_values[variant] == (control = 0x00, case = 0x01)
@test Ψ.args[2].treatment_values[variant] == (control = 0x01, case = 0x02)
@test (Ψ.args[1].treatment_values[variant] == (control = allele1*allele1, case = allele1*allele2) &&
Ψ.args[2].treatment_values[variant] == (control = allele1*allele2, case = allele2*allele2)) ||
(Ψ.args[1].treatment_values[variant] == (control = allele2*allele2, case = allele1*allele2) &&
Ψ.args[2].treatment_values[variant] == (control = allele1*allele2, case = allele1*allele1))
else
# Otherwise we check they are one or the other
arg = only(Ψ.args)
@test arg.treatment_values[variant]==(control = 0x00, case = 0x01) ||
arg.treatment_values[variant]==( control = 0x01, case = 0x02)
@test arg.treatment_values[variant] == (control = allele1*allele1, case = allele1*allele2) ||
arg.treatment_values[variant] == (control = allele2*allele2, case = allele1*allele2) ||
arg.treatment_values[variant] == (control = allele1*allele2, case = allele2*allele2) ||
arg.treatment_values[variant] == (control = allele1*allele2, case = allele1*allele1)

end
end
end
Expand All @@ -48,6 +57,9 @@ end
"--positivity-constraint=0"
])
TargeneCore.julia_main()
# Define SNP information to check string allele definitions
snpdata = read_bed_chromosome(joinpath(TESTDIR, "data", "ukbb", "genotypes" , "ukbb_1."))
snp_info = select(DataFrame(snpdata.snp_info), [:snpid, :allele1, :allele2])
# Check dataset
dataset = DataFrame(Arrow.Table(joinpath(tmpdir, "final.data.arrow")))
@test size(dataset) == (1940, 886)
Expand All @@ -68,7 +80,7 @@ end
nrow = repeat([875], 5)
)

check_estimands_levels_order(estimands)
check_estimands_levels_order(estimands, snp_info)
end

@testset "Test inputs_from_config gwas: positivity constraint" begin
Expand All @@ -85,6 +97,9 @@ end
"--positivity-constraint=0.2"
])
TargeneCore.julia_main()
# Define SNP information to check string allele definitions
snpdata = read_bed_chromosome(joinpath(TESTDIR, "data", "ukbb", "genotypes" , "ukbb_1."))
snp_info = select(DataFrame(snpdata.snp_info), [:snpid, :allele1, :allele2])
# Check dataset
dataset = DataFrame(Arrow.Table(joinpath(tmpdir, "final.data.arrow")))
@test size(dataset) == (1940, 886)
Expand All @@ -103,7 +118,7 @@ end
nrow = repeat([777], 5)
)

check_estimands_levels_order(estimands)
check_estimands_levels_order(estimands, snp_info)
end

end
Expand Down
28 changes: 27 additions & 1 deletion test/testutils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -202,4 +202,30 @@ function make_estimands_configuration_file(config_generator=make_estimands_confi
filename = joinpath(dir, "configuration.yaml")
TMLE.write_yaml(filename, config)
return filename
end
end

# Additional helper functions for testing GWAS treatment strings

"""
    get_only_file_with_suffix(files, suffix)

Return the one element of `files` that ends with `suffix`.

The lookup goes through `only`, so it errors unless exactly one element matches.
"""
function get_only_file_with_suffix(files, suffix)
    matches = filter(endswith(suffix), files)
    return only(matches)
end

"""
    files_matching_prefix(prefix)

List the entries of the directory part of `prefix` whose names start with the file
part of `prefix`.

`prefix` is split with `splitdir`. When the directory part is empty, the current
directory (`"."`) is scanned. Each match is returned joined to the directory part
exactly as it was given.
"""
function files_matching_prefix(prefix)
    directory, name_prefix = splitdir(prefix)
    search_dir = directory == "" ? "." : directory
    return [
        joinpath(directory, entry) for
        entry in readdir(search_dir) if startswith(entry, name_prefix)
    ]
end

"""
    read_bed_chromosome(bedprefix)

Locate the files sharing `bedprefix` and load them with `SnpData`.

Exactly one file ending in `fam`, one ending in `bim` and one ending in `bed` must
match the prefix (enforced by `get_only_file_with_suffix`). The bed path is passed to
`SnpData` with its last four characters removed, and the fam and bim paths are passed
as the `famnm` and `bimnm` keywords.
"""
function read_bed_chromosome(bedprefix)
    candidates = files_matching_prefix(bedprefix)
    fam_file = get_only_file_with_suffix(candidates, "fam")
    bim_file = get_only_file_with_suffix(candidates, "bim")
    bed_path = get_only_file_with_suffix(candidates, "bed")
    # `chop` counts characters, whereas `[1:end-4]` counts bytes and throws a
    # `StringIndexError` when a multibyte character sits just before the extension.
    bed_file = String(chop(bed_path; tail=4))
    return SnpData(bed_file, famnm=fam_file, bimnm=bim_file)
end

Loading