Skip to content

Commit

Permalink
Treatment are now allele strings
Browse files Browse the repository at this point in the history
  • Loading branch information
joshua-slaughter committed Oct 8, 2024
1 parent 74a4239 commit 99cf6aa
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

julia_version = "1.10.4"
manifest_format = "2.0"
project_hash = "2b58dcc0ffd21f9ddd37c869a6c32c55af516134"
project_hash = "b1a916e4f68d2fb953c6eba21c0fc7546e7dc5c6"

[[deps.ARFFFiles]]
deps = ["CategoricalArrays", "Dates", "Parsers", "Tables"]
Expand Down
33 changes: 27 additions & 6 deletions src/inputs_from_config.jl
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ end
"""
function treatments_from_variant(variant::String, dataset::DataFrame)
variant_levels = sort(levels(dataset[!, variant], skipmissing=true))
return Dict{Symbol, Vector{UInt8}}(Symbol(variant)=>variant_levels)
return Dict{Symbol, Vector{String}}(Symbol(variant)=>variant_levels)
end

function estimands_from_gwas(dataset, variants, outcomes, confounders;
Expand Down Expand Up @@ -198,14 +198,35 @@ function read_bed_chromosome(bedprefix)
return SnpData(bed_file, famnm=fam_file, bimnm=bim_file)
end

function map_allele(value, allele1, allele2)
if value == 0x00
return "$allele1$allele1"
elseif value == 0x01
return missing
elseif value == 0x02
return "$allele1$allele2"
elseif value == 0x03
return "$allele2$allele2"
end
end

function convert_string(snpdata)
genotypes_data = []
for col in 1:snpdata.snps
allele_col = snpdata.snparray[:,col]
allele1 = snpdata.snp_info[col, "allele1"]
allele2 = snpdata.snp_info[col, "allele2"]
mapped_col = map(value -> map_allele(value, allele1, allele2), allele_col)
push!(genotypes_data, mapped_col)
end
return DataFrame(genotypes_data, snpdata.snp_info."snpid")
end

function get_genotypes_from_beds(bedprefix)
snpdata = read_bed_chromosome(bedprefix)
genotypes = DataFrame(convert(Matrix{UInt8}, snpdata.snparray), snpdata.snp_info."snpid")
genotype_map = Union{UInt8, Missing}[0, missing, 1, 2]
for col in names(genotypes)
genotypes[!, col] = [genotype_map[x+1] for x in genotypes[!, col]]
end
genotypes = convert_string(snpdata)
insertcols!(genotypes, 1, :SAMPLE_ID => snpdata.person_info."iid")

return genotypes
end

Expand Down
25 changes: 19 additions & 6 deletions test/inputs_from_gwas_config.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,27 @@ function get_summary_stats(estimands)
return sort(combine(groupby(results, :OUTCOME), nrow), :OUTCOME)
end

function check_estimands_levels_order(estimands)
function check_estimands_levels_order(estimands, snp_info)
for Ψ in estimands
# If the two components are present, the first is the 0 -> 1 and the second is the 1 -> 2
variant = only(keys.args[1].treatment_values))
variant_info = filter(:snpid=>x->x==String(variant),snp_info)
allele1, allele2 = variant_info.allele1[1], variant_info.allele2[1]

# Here, we check if the order is sufficient to be able to compute non-linear effects any of these combinations will do
if length.args) == 2
@test Ψ.args[1].treatment_values[variant] == (control = 0x00, case = 0x01)
@test Ψ.args[2].treatment_values[variant] == (control = 0x01, case = 0x02)
@test.args[1].treatment_values[variant] == (control = allele1*allele1, case = allele1*allele2) &&
Ψ.args[2].treatment_values[variant] == (control = allele1*allele2, case = allele2*allele2)) ||
.args[1].treatment_values[variant] == (control = allele2*allele2, case = allele1*allele2) &&
Ψ.args[2].treatment_values[variant] == (control = allele1*allele2, case = allele1*allele1))
else
# Otherwise we check they are one or the other
arg = only.args)
@test arg.treatment_values[variant]==(control = 0x00, case = 0x01) ||
arg.treatment_values[variant]==( control = 0x01, case = 0x02)
@test arg.treatment_values[variant] == (control = allele1*allele1, case = allele1*allele2) ||
arg.treatment_values[variant] == (control = allele2*allele2, case = allele1*allele2) ||
arg.treatment_values[variant] == (control = allele1*allele2, case = allele2*allele2) ||
arg.treatment_values[variant] == (control = allele1*allele2, case = allele1*allele1)

end
end
end
Expand All @@ -48,6 +57,10 @@ end
"--positivity-constraint=0"
])
TargeneCore.julia_main()

# Define SNP information to check string allele defintions
snpdata = read_bed_chromosome(joinpath(TESTDIR, "data", "ukbb", "genotypes" , "ukbb_1."))
snp_info = select(DataFrame(snpdata.snp_info), [:snpid, :allele1, :allele2])
# Check dataset
dataset = DataFrame(Arrow.Table(joinpath(tmpdir, "final.data.arrow")))
@test size(dataset) == (1940, 886)
Expand All @@ -68,7 +81,7 @@ end
nrow = repeat([875], 5)
)

check_estimands_levels_order(estimands)
check_estimands_levels_order(estimands, snp_info)
end

@testset "Test inputs_from_config gwas: positivity constraint" begin
Expand Down

0 comments on commit 99cf6aa

Please sign in to comment.