extract_cgmlist_from_json.R
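This page documents a small R script that reads per-sample JSON result files, extracts the cgMLST allele calls, replaces missing-data codes with 0 and writes a cgMLST profile (cgmlst.profile) together with a table of fake metadata (metadata.tsv). A Python version of the extraction step is included at the bottom of the page.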
library(rjson)
library(tidyverse)
library(readr)
library(lubridate)  # for months() below; attached automatically by tidyverse >= 2.0.0
# args[1] = path to the folder containing the JSON input files
# args[2] = path to the output folder
## set command line arguments ----
args <- commandArgs(trailingOnly = TRUE)
# stop the script if the required command line arguments are missing
if (length(args) < 2) {
  print("Please provide a folder with JSON result files and an output folder!")
  stop("Requires two command line arguments.")
}
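# build the output file paths; args[2] is expected to end with "/" since the paths are joined with paste(..., sep = "")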
output_path <- paste(args[2], "cgmlst.profile", sep = "")
output_path
output_path_meta <- paste(args[2], "metadata.tsv", sep = "")
output_path_meta
files <- list.files(args[1], pattern = "\\.json$")
# create lists to collect data
list_of_dfs <- vector("list", length(files))
list_of_row_names <- vector("list", length(files))
# function to compare two rows (handy for spotting missing data codes); not used by default but could be useful
compare_rows <- function(df, row1, row2) {
  # Ensure row indices are valid
  if (row1 > nrow(df) || row2 > nrow(df) || row1 <= 0 || row2 <= 0) {
    stop("Invalid row indices")
  }
  # Extract the two rows
  r1 <- df[row1, ]
  r2 <- df[row2, ]
  # Compare the rows column by column
  differences <- mapply(function(x, y, name) {
    if (!identical(x, y)) {
      return(data.frame(Column = name, Row1 = x, Row2 = y, stringsAsFactors = FALSE))
    }
    return(NULL)
  }, r1, r2, names(df), SIMPLIFY = FALSE)
  # Remove NULL entries and combine results
  differences <- do.call(rbind, differences[!sapply(differences, is.null)])
  return(differences)
}
# loop through the JSON files and collect the cgMLST allele calls in a list
for (f in files) {
  my.JSON <- fromJSON(file = paste(args[1], f, sep = ""))
  # assumes the cgMLST typing result is the second entry of "typing_result"
  list_of_dfs[[f]] <- my.JSON[["typing_result"]][[2]][["result"]][["alleles"]] %>% unlist() %>% as_tibble_row()
  list_of_row_names[[f]] <- my.JSON[[1]] %>% as_tibble_row(.name_repair = c("unique"))
}
# collapse the list into a data frame and add the sample ids as a column
df <- bind_rows(list_of_dfs)
rownames <- bind_rows(list_of_row_names)
names(rownames) <- "jasenid"
df2 <- cbind(rownames, df) %>%
  as_tibble()
# uncomment to use the compare_rows function
#result <- compare_rows(df2, 10, 11)
#result
## replace missing data values
df3 <- df2 %>%
  replace(. == "ASM", "0") %>%
  replace(. == "EXC", "0") %>%
  replace(. == "INF", "0") %>%
  replace(. == "LNF", "0") %>%
  replace(. == "PLNF", "0") %>%
  replace(. == "PLOT3", "0") %>%
  replace(. == "PLOT5", "0") %>%
  replace(. == "LOTC", "0") %>%
  replace(. == "NIPH", "0") %>%
  replace(. == "NIPHEM", "0") %>%
  replace(. == "PAMA", "0") %>%
  replace(. == "ALM", "0")
# compare_rows(df3, 10, 11)
write_tsv(
  x = df3,
  file = output_path
)
## Create fake metadata
mimosa_count <- c("Sverige","Norge","Danmark")
mimosa_regions <- c("A","B","C","D")
mimosa_dates <- seq(Sys.Date()-months(3),Sys.Date(), by = '1 day')
# Example dataframe
set.seed(69)
fake_metadata <- data.frame(
  sample = dplyr::pull(rownames, jasenid),
  Country = sample(mimosa_count, size = nrow(rownames), replace = TRUE),
  Region = sample(mimosa_regions, size = nrow(rownames), replace = TRUE),
  Source = rep("clinical", nrow(rownames)),
  Date = sample(mimosa_dates, size = nrow(rownames), replace = TRUE),
  Note = rep("some notes", nrow(rownames))
)
write_tsv(
  x = fake_metadata,
  file = output_path_meta
)
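The R script can be run as, for example, Rscript extract_cgmlist_from_json.R path/to/json/ path/to/output/ (illustrative paths; both arguments should end with a trailing slash because the paths are concatenated with paste(..., sep = "")). The Python script below does the same extraction step: it selects the cgMLST entry by its "type" field instead of a fixed position, replaces the missing-data codes and writes cgmlst.tsv to the output folder, but it does not create the fake metadata.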
import os
import json
import pandas as pd
def process_json_files(input_folder, output_folder):
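    """Collect the cgMLST allele calls from every JSON result file in input_folder,
    replace missing-data codes with "0" and write the combined profile to
    <output_folder>/cgmlst.tsv."""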
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    output_file = os.path.join(output_folder, "cgmlst.tsv")
    # Collect data from JSON files
    json_files = [f for f in os.listdir(input_folder) if f.endswith('.json')]
    if not json_files:
        print("No JSON files found in the specified folder.")
        return
    data_frames = []
    for json_file in json_files:
        with open(os.path.join(input_folder, json_file), 'r') as f:
            data = json.load(f)
        # Extract alleles from the specific cgmlst typing result
        typing_result = next((result for result in data.get("typing_result", []) if result.get("type") == "cgmlst"), None)
        if not typing_result:
            print(f"Skipping file {json_file}: No 'cgmlst' typing result found.")
            continue
        alleles = typing_result.get("result", {}).get("alleles", {})
        df = pd.DataFrame([alleles])
        df["jasenid"] = json_file  # Add identifier as a column
        data_frames.append(df)
    # Combine data into a single DataFrame
    if data_frames:
        df_combined = pd.concat(data_frames, ignore_index=True)
        # Replace missing data codes
        missing_codes = ["ASM", "EXC", "INF", "LNF", "PLNF", "PLOT3", "PLOT5",
                         "LOTSC", "NIPH", "NIPHEM", "PAMA", "ALM"]
        df_combined.replace(missing_codes, "0", inplace=True)
        print("DataFrame Preview:")
        print(df_combined.head())
        # Save the result to a TSV file
        df_combined.to_csv(output_file, sep='\t', index=False)
        print(f"Processed data saved to {output_file}")
    else:
        print("No valid data to process.")

# Usage
if __name__ == "__main__":
    import sys
    if len(sys.argv) != 3:
        print("Usage: python script.py <input_json_folder> <output_folder>")
        sys.exit(1)
    input_folder = sys.argv[1]
    output_folder = sys.argv[2]
    process_json_files(input_folder, output_folder)