Skip to content

Commit

Permalink
Merge pull request #3 from TARGENE/ol_forward_unknown_fields
Browse files Browse the repository at this point in the history
add support for custom columns
  • Loading branch information
olivierlabayle authored Feb 2, 2023
2 parents 1744738 + d986117 commit ef69116
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 22 deletions.
19 changes: 12 additions & 7 deletions src/datasets_extraction.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ const ORDINAL_FIELDS = Set([
1319, 1498
])

# `asint` normalises a field identifier (or a collection of them) to `Int`,
# dispatching on the type of its argument.

# Real numbers go through the `Int` constructor.
function asint(x::Real)
    return Int(x)
end

# Strings are parsed as integers.
function asint(x::String)
    return parse(Int, x)
end

# Arrays are converted element by element, each element being dispatched on its own.
function asint(field_ids::AbstractArray)
    return collect(asint(id) for id in field_ids)
end

# Fallback for any other type: defer to the `Int` constructor.
function asint(x)
    return Int(x)
end

"""
    get_fields_metadata(fields_metadata::DataFrame, field_id)

Look up a single `field_id` in `fields_metadata`.

The `field_id` column of `fields_metadata` is compared against `string(field_id)`,
so the identifier may be given either as an integer or as a string.

Returns `(asint(field_id), value_type, encoding_id)` taken from the first matching
row, or `(nothing, nothing, nothing)` when no row matches.
"""
function get_fields_metadata(fields_metadata::DataFrame, field_id)
    # Convert once, outside the predicate, instead of once per row.
    target = string(field_id)
    row_id = findfirst(row -> row.field_id == target, eachrow(fields_metadata))
    row_id === nothing && return nothing, nothing, nothing
    f_meta = fields_metadata[row_id, :]
    # The identifier is returned as an integer so that callers can compare it
    # against integer collections such as `ORDINAL_FIELDS`.
    return asint(field_id), f_meta.value_type, f_meta.encoding_id
end

"""
Expand All @@ -24,20 +24,25 @@ all field_ids in the list.
function get_fields_metadata(fields_metadata::DataFrame, field_ids::AbstractVector)
    field_ids_ = []
    value_types = []
    encoding_ids = []
    for f_id in field_ids
        # One lookup per field; the resolved identifier is kept alongside the
        # metadata so that the value returned below is the resolved one.
        field_id, value_type, encoding_id = get_fields_metadata(fields_metadata, f_id)
        push!(field_ids_, field_id)
        push!(value_types, value_type)
        push!(encoding_ids, encoding_id)
    end
    # All fields of one entry must share the same value type and encoding.
    @assert all(==(value_types[1]), value_types)
    @assert all(==(encoding_ids[1]), encoding_ids)

    # Fields without a metadata row yield `nothing` in all three positions.
    return first(field_ids_), first(value_types), first(encoding_ids)
end

function build_from_fields_entry(fields_entry, dataset, fields_metadata)
field_ids = asint(fields_entry["fields"])
field_ids = fields_entry["fields"]
field_id, value_type, encoding_id = UKBMain.get_fields_metadata(fields_metadata, field_ids)
if value_type === nothing
return process_custom(dataset, fields_entry)
end
# Ordinal data
if field_id ORDINAL_FIELDS
return process_ordinal(dataset, fields_entry)
Expand Down
8 changes: 8 additions & 0 deletions src/fields_processing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ function process_binary_arrayed(dataset, fields_entry)
return DataFrame(collect(output), string.(phenotypes))
end

"""
process_custom(dataset, fields_entry)
Processing function for a manually added column. Simply forward the column.
"""
function process_custom(dataset, fields_entry)
    # NOTE(review): `asvector` is defined elsewhere; presumably it wraps a single
    # field name into a vector so that the selection below always takes a list
    # of columns — confirm against its definition.
    columns = asvector(fields_entry["fields"])
    # Select the requested columns with `!` indexing.
    return dataset[!, columns]
end

"""
process_ordinal(dataset, fields_entry)
"""
Expand Down
2 changes: 1 addition & 1 deletion src/ukb_download.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ download_fields_metadata(;output="fields_metadata.txt") =
Downloads.download("biobank.ndph.ox.ac.uk/ukb/scdown.cgi?fmt=txt&id=1", output)

"""
    read_fields_metadata(; input="fields_metadata.txt")

Read the fields metadata file located at `input` into a `DataFrame`.

The `field_id` column is read as `String`.
"""
read_fields_metadata(;input="fields_metadata.txt") =
    CSV.read(input, DataFrame, types=Dict("field_id" => String))

function download_and_read_fields_metadata(;path="fields_metadata.txt")
if !isfile(path)
Expand Down
77 changes: 77 additions & 0 deletions test/config/config_with_custom_fields.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
traits:
- fields:
- 1408
phenotypes:
- name: 1408
- fields:
- 1727
phenotypes:
- name: 1727
- fields:
- 1379
phenotypes:
- name: 1379
- fields:
- 1329
phenotypes:
- name: 1329
- fields: 1339
phenotypes:
- name: 1339
- fields:
- 30270
phenotypes:
- name: 30270
- fields:
- 1548
phenotypes:
- name: 1548
- fields: 1707
phenotypes:
- name: 1707
- fields:
- 1777
phenotypes:
- name: 1777
# Arrayed/Instanced traits
- fields: "40006"
phenotypes:
- name: disease_1
codings: [C439, D414, C440]
- name: disease_2
codings: D487
- fields: 20002
phenotypes:
- name: disease_3
codings:
- [1674, 1065, "1066", 1067]
- name: disease_4
codings: [1452, 1453, 1454, 1455, 1548, 1549, 1550, 1625, 1660, 1661, 1667, 1680]
- name: disease_5
codings: 1762
- fields: [41202, 41204]
phenotypes:
- name: disease_6
codings: ["O021", D649, "O26", "E669"]
- name: disease_7
codings: ["G20"]
- name: disease_8
codings:
- "I849"
- "O209"

- fields: 21000
phenotypes:
- name: ethnicity

- fields: 22001
phenotypes:
- name: genetic sex

- fields: 21003
phenotypes:
- name: age
# These fields do not originally pertain to
# the UKB dataset but have been manually added
# to a decrypted dataset
- fields: [customfield, dummyfield]
Loading

0 comments on commit ef69116

Please sign in to comment.