Skip to content

extract_cgmlist_from_json.R

AnderssonOlivia edited this page Dec 20, 2024 · 5 revisions

R

library(rjson)
library(tidyverse)
library(readr)


# Usage: Rscript <this script> <input_dir> <output_dir>
# arg[1] = path to the folder containing the JSON input files
# arg[2] = path to the folder the output files are written to

## set command line arguments ----
args <- commandArgs(trailingOnly = TRUE)

# Stop unless both folders were supplied; with a single argument args[2]
# would be NA and end up inside the output file names.
if (length(args) < 2) {
  stop(
    "Requires two command line arguments: <input_dir> <output_dir>.",
    call. = FALSE
  )
}

# Make sure the output folder exists before anything is written to it.
dir.create(args[2], recursive = TRUE, showWarnings = FALSE)

# file.path() inserts the separator itself, so the folders may be given
# with or without a trailing slash.
output_path <- file.path(args[2], "cgmlst.profile")
output_path

output_path_meta <- file.path(args[2], "metadata.tsv")
output_path_meta


# `pattern` is a regular expression, not a glob: anchor on the extension so
# that only names ending in ".json" are picked up.
files <- list.files(args[1], pattern = "\\.json$")

if (length(files) == 0) {
  stop("No JSON files found in ", args[1], call. = FALSE)
}

# create lists to collect data, one slot per input file
list_of_dfs <- vector("list", length(files))
list_of_row_names <- vector("list", length(files))

# Compare two rows of a data frame column by column (handy for spotting
# missing-data codes). Not used by the script itself, but could be useful.
#
# Args:
#   df:   data frame or tibble to inspect.
#   row1: 1-based index of the first row.
#   row2: 1-based index of the second row.
#
# Returns:
#   A data frame with one row per differing column (columns `Column`, `Row1`
#   and `Row2`), or NULL when the two rows are identical.
#
# Stops with "Invalid row indices" when an index is <= 0 or > nrow(df).

compare_rows <- function(df, row1, row2) {
  # Ensure row indices are valid
  if (row1 > nrow(df) || row2 > nrow(df) || row1 <= 0 || row2 <= 0) {
    stop("Invalid row indices")
  }

  # Extract the two rows
  r1 <- df[row1, ]
  r2 <- df[row2, ]

  # Walk over the columns in parallel; columns that match contribute NULL
  differences <- mapply(function(x, y, name) {
    if (identical(x, y)) {
      return(NULL)
    }
    data.frame(Column = name, Row1 = x, Row2 = y, stringsAsFactors = FALSE)
  }, r1, r2, names(df), SIMPLIFY = FALSE)

  # vapply() always yields a logical vector (sapply() returns list() for a
  # data frame without columns, which `!` cannot negate)
  has_difference <- !vapply(differences, is.null, logical(1))

  # Combine the remaining entries; rbind of nothing gives NULL
  do.call(rbind, differences[has_difference])
}

# Loop through the JSON files and collect, per file, the cgMLST allele calls
# and the sample identifier. Results are stored by position so that both
# lists stay aligned; files that are skipped leave a NULL in both lists.

for (i in seq_along(files)) {

  # file.path() adds the separator, so args[1] needs no trailing slash
  sample_json <- fromJSON(file = file.path(args[1], files[i]))

  # Pick the typing result by its type instead of relying on it being the
  # second entry of "typing_result".
  typing_results <- sample_json[["typing_result"]]
  is_cgmlst <- vapply(
    typing_results,
    function(res) is.list(res) && identical(res[["type"]], "cgmlst"),
    logical(1)
  )
  if (!any(is_cgmlst)) {
    warning(
      "Skipping file ", files[i], ": no 'cgmlst' typing result found.",
      call. = FALSE
    )
    next
  }

  allele_calls <- unlist(
    typing_results[[which(is_cgmlst)[1]]][["result"]][["alleles"]]
  )
  # unlist() gives a numeric vector when a sample has no code strings at all;
  # force character so every sample has the same column type when the rows
  # are bound together later.
  allele_calls <- setNames(as.character(allele_calls), names(allele_calls))

  list_of_dfs[[i]] <- as_tibble_row(allele_calls)
  # The first element of the JSON document is used as the sample identifier
  list_of_row_names[[i]] <- as_tibble_row(
    sample_json[[1]],
    .name_repair = "unique"
  )
}

# Collapse both lists into data frames and put the identifier column,
# named "jasenid", in front of the allele columns.

df <- bind_rows(list_of_dfs)
rownames <- setNames(bind_rows(list_of_row_names), "jasenid")
df2 <- as_tibble(cbind(rownames, df))

# uncomment to use the compare_rows function
#result <- compare_rows(df2, 10, 11)
#result


## replace missing data values

# Codes that stand for "no usable allele call"; each is written as "0".
# NOTE(review): these look like chewBBACA classification codes - confirm
# the list against the allele caller that produced the JSON files.
missing_codes <- c(
  "ASM", "EXC", "INF", "LNF", "PLNF", "PLOT3", "PLOT5",
  "LOTSC", "NIPH", "NIPHEM", "PAMA", "ALM"
)

# One pass over the character columns; %in% never returns NA, so cells that
# are NA are left untouched. The sample identifier column is excluded so an
# identifier can never be overwritten.
df3 <- df2 %>%
  mutate(across(
    where(is.character) & !any_of("jasenid"),
    function(calls) replace(calls, calls %in% missing_codes, "0")
  ))


# compare_rows(df3, 10, 11)

# Write the cleaned allele table as a tab-separated file.
write_tsv(df3, file = output_path)


## Create fake metadata

# Value pools the placeholder metadata is drawn from.
mimosa_count <- c("Sverige", "Norge", "Danmark")
mimosa_regions <- c("A", "B", "C", "D")

# Every day of the last three months. seq() steps back by calendar month in
# base R and does not produce NA, whereas `Sys.Date() - months(3)` needs
# lubridate to be attached and gives NA when the target day does not exist
# (e.g. three months before 31 May), which makes the day sequence fail.
mimosa_start <- seq(Sys.Date(), by = "-3 months", length.out = 2)[2]
mimosa_dates <- seq(mimosa_start, Sys.Date(), by = "1 day")


# Placeholder data frame: one row per sample, random but reproducible values

set.seed(69)

n_samples <- nrow(rownames)

fake_metadata <- data.frame(
  sample = dplyr::pull(rownames, jasenid),
  Country = sample(mimosa_count, size = n_samples, replace = TRUE),
  Region = sample(mimosa_regions, size = n_samples, replace = TRUE),
  Source = rep("clinical", n_samples),
  Date = sample(mimosa_dates, size = n_samples, replace = TRUE),
  Note = rep("some notes", n_samples)
)


write_tsv(
  x = fake_metadata,
  file = output_path_meta
)





Python

import os
import json
import pandas as pd

def process_json_files(input_folder, output_folder):
    """Collect cgMLST allele calls from JSON files into one TSV table.

    Every file in ``input_folder`` whose name ends in ``.json`` is read, in
    sorted name order so the row order is reproducible. From each file the
    first entry of ``typing_result`` with ``type == "cgmlst"`` is taken and
    its ``result.alleles`` mapping becomes one row. Files that are not valid
    JSON or that have no such entry are reported and skipped.

    Args:
        input_folder: Folder containing the JSON files.
        output_folder: Folder to write ``cgmlst.tsv`` to; created if missing.

    Returns:
        None. The table is written to ``<output_folder>/cgmlst.tsv`` with the
        file name in the first column (``jasenid``) followed by one column per
        locus. Missing-data codes are replaced by ``"0"``.
    """
    # Codes that mark a locus without a usable allele call.
    missing_codes = ["ASM", "EXC", "INF", "LNF", "PLNF", "PLOT3", "PLOT5",
                     "LOTSC", "NIPH", "NIPHEM", "PAMA", "ALM"]

    os.makedirs(output_folder, exist_ok=True)
    output_file = os.path.join(output_folder, "cgmlst.tsv")

    # os.listdir() returns names in arbitrary order; sort for stable output.
    json_files = sorted(f for f in os.listdir(input_folder) if f.endswith('.json'))
    if not json_files:
        print("No JSON files found in the specified folder.")
        return

    rows = []

    for json_file in json_files:
        try:
            with open(os.path.join(input_folder, json_file), 'r', encoding='utf-8') as f:
                data = json.load(f)
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            print(f"Skipping file {json_file}: not valid JSON ({exc}).")
            continue

        # Extract alleles from the specific cgmlst typing result
        typing_results = data.get("typing_result") if isinstance(data, dict) else None
        typing_result = next(
            (result for result in (typing_results or [])
             if isinstance(result, dict) and result.get("type") == "cgmlst"),
            None,
        )
        if not typing_result:
            print(f"Skipping file {json_file}: No 'cgmlst' typing result found.")
            continue

        # `or {}` also covers keys that are present but null in the JSON.
        alleles = (typing_result.get("result") or {}).get("alleles") or {}

        # Identifier first, then the loci; the identifier is re-assigned last
        # so a locus that happens to be called "jasenid" cannot replace it.
        row = {"jasenid": json_file}
        row.update(alleles)
        row["jasenid"] = json_file
        rows.append(row)

    if not rows:
        print("No valid data to process.")
        return

    # Build the table in one go instead of concatenating per-file frames.
    df_combined = pd.DataFrame(rows)

    # Replace missing data codes
    df_combined = df_combined.replace(missing_codes, "0")

    print("DataFrame Preview:")
    print(df_combined.head())

    # Save the result to a TSV file
    df_combined.to_csv(output_file, sep='\t', index=False)
    print(f"Processed data saved to {output_file}")

# Command-line entry point: expects the input folder and the output folder.
if __name__ == "__main__":
    import sys

    if len(sys.argv) == 3:
        _, input_folder, output_folder = sys.argv
        process_json_files(input_folder, output_folder)
    else:
        print("Usage: python script.py <input_json_folder> <output_folder>")
        sys.exit(1)
Clone this wiki locally