diff --git a/R/abm_resampling.R b/R/abm_resampling.R new file mode 100644 index 00000000..39430d61 --- /dev/null +++ b/R/abm_resampling.R @@ -0,0 +1,217 @@ +#################################################### +# Rescale the output of ABM such that it matches with US +# serology data +# This is done by first subsampling the pre-omicron +# infections in the ABM to match with seroprevalence +# on 2021-11-15 +# Then subsampling the omicron infections to match with +# seroprevalence on 2022-02-10 +# Store the subsample as a `retention` table in the +# sqlite database (1 to be retained, 0 to be removed) +#################################################### + +pacman::p_load(dplyr, data.table, lubridate, tidyr, ggplot2) +theme_set(theme_bw(base_size = 13)) +theme_update( + text = element_text(family = "Open Sans"), + strip.background = element_blank(), + strip.text = element_text(face = "bold"), + panel.grid = element_line(linewidth = rel(0.43), colour = "#D1D3D4"), + panel.border = element_rect(linewidth = rel(0.43)), + axis.ticks = element_line(linewidth = rel(0.43)) +) + +sero_flus <- fread("./data/abm-data/serology_flus.csv") +sero_abm <- fread("./data/abm-data/serology_abm.csv") + +crude_auc <- function(days, values, rebase = FALSE) { + if (rebase) values <- values - values[1] + xout <- seq(min(days), max(days), by = 1) + yout <- approx(days, values, xout)$y + return(sum(yout)) +} + +sample_inf_hist <- function(df, probs) { + retain <- rbinom(nrow(df), 1, probs) + return(df[retain == 1, ]) +} + +calc_sero <- function(inf_df) { + first_infections <- inf_df |> + group_by(pid, age) |> + summarise(infected_time = min(infected_time)) + + serology <- first_infections |> + group_by(age, time = infected_time) |> + summarise(n = n()) |> + complete(time = 0:874, fill = list(n = 0)) |> + left_join(pop_age, by = "age") |> + mutate( + seroprevalence = cumsum(n) / pop * 100, + Site = "ABM_scaled", + date = ymd("2020-02-10") + time + ) + + return(serology) +} + +plot_sero <- function(abm_df) { + ggplot() + + geom_line(aes(x = mid_date, y = value, colour = Site), + data = sero_flus |> filter(Site == "US") + ) + + geom_line(aes(x = date, y = seroprevalence, colour = Site), + data = abm_df + ) + + geom_vline( + xintercept = ymd(c("2021-11-15", "2022-02-10")), + lty = 2 + ) + + facet_wrap(~age) + + scale_x_date(date_breaks = "6 months", date_labels = "%m/%y") + + theme(axis.title.x = element_blank()) + + labs(y = "Seroprevalence %") +} + +#### ABM infection history +age_df <- data.table::fread("./data/abm-data/sim_ages.txt") +db_path <- "./data/abm-data/sim_data_0.sqlite" +db_out_path <- "./data/abm-data/sim_data_scaled_us.sqlite" +file.copy(db_path, db_out_path, overwrite = TRUE) +con <- dbConnect(SQLite(), db_out_path) +dbListTables(con) +q <- dbSendQuery( + con, + "SELECT inf, inf_owner_id, infected_time, strain FROM infection_history" +) +query_inf_history <- dbFetch(q) +dbClearResult(q) + +age_df <- age_df |> + mutate(age = case_when( + age <= 17 ~ "0-17", + age <= 49 ~ "18-49", + age <= 64 ~ "50-64", + TRUE ~ "65+" + )) +pop_age <- age_df |> + group_by(age) |> + summarise(pop = n()) +infection_history <- query_inf_history |> + left_join(age_df, by = join_by(inf_owner_id == pid)) |> + rename(pid = inf_owner_id) |> + mutate( + date = ymd("2020-02-10") + infected_time, + strain = ifelse(strain == "OMICRON", "OMICRON", "PREOM") + ) + +#### Pre-Omicron scaling +preom_obs <- sero_flus |> + filter(Site == "US") |> + select(date = mid_date, age, value) |> + filter(date <= ymd("2021-11-15")) +preom_start <- min(preom_obs$date) +preom_stop <- max(preom_obs$date) + +preom_obs_auc <- preom_obs |> + group_by(age) |> + summarise( + auc_obs = crude_auc(date, value, rebase = FALSE), + last_obs = value[date == preom_stop] + ) + +preom_abm_auc <- sero_abm |> + filter(date >= preom_start, date <= preom_stop) |> + group_by(age) |> + summarise( + auc_abm = crude_auc(date, seroprevalence, rebase = FALSE), + last_abm = seroprevalence[date == preom_stop], + first_abm = 0 + ) + +preom_scale <- preom_obs_auc |> + left_join(preom_abm_auc, by = "age") |> + mutate( + scale_preom_auc = auc_obs / auc_abm, + scale_om_lin = (last_obs - first_abm) / (last_abm - first_abm) + ) |> + select(-contains("obs"), -contains("abm")) + +#### Sample preom +preom_samp <- infection_history |> + left_join(preom_scale, by = "age") +probs <- ifelse( + preom_samp$strain == "OMICRON", + 1, + preom_samp$scale_preom_auc # auc scale is better fit +) +preom_samp <- sample_inf_hist(infection_history, probs) +preom_sero <- calc_sero(preom_samp) + +plot_sero(preom_sero) # Matches well on first dash line + +#### Omicron scaling +om_obs <- sero_flus |> + filter(Site == "US") |> + select(date = mid_date, age, value) |> + filter(date >= ymd("2021-11-15"), date <= ymd("2022-02-11")) +om_start <- min(om_obs$date) +om_stop <- max(om_obs$date) + +om_obs_auc <- om_obs |> + group_by(age) |> + summarise( + auc_obs = crude_auc(date, value, rebase = TRUE), + last_obs = value[date == om_stop] + ) + +om_abm_auc <- preom_sero |> + filter(date >= om_start, date <= om_stop) |> + group_by(age) |> + summarise( + auc_abm = crude_auc(date, seroprevalence, rebase = TRUE), + last_abm = seroprevalence[date == om_stop], + first_abm = seroprevalence[date == om_start] + ) + +om_scale <- om_obs_auc |> + left_join(om_abm_auc, by = "age") |> + left_join(pintersect, by = "age") |> + mutate( + scale_om_auc = auc_obs / auc_abm / (1 - p), + scale_om_lin = (last_obs - first_abm) / (last_abm - first_abm) # / (1 - p) + ) |> + select(-contains("obs"), -contains("abm")) + +#### Sample pre- and post-om +om_samp <- preom_samp |> + left_join(om_scale, by = "age") +probs <- ifelse( + om_samp$strain == "OMICRON", + om_samp$scale_om_lin, # linear scale is better here + 1 +) +om_samp <- sample_inf_hist(preom_samp, probs) +om_sero <- calc_sero(om_samp) + +plot_sero(om_sero) + +ggsave("./fig/scaled_seroprevalence.png", + width = 2800, height = 1800, + units = "px" +) + +#### write to database +sql <- "CREATE TABLE retention( + inf TEXT PRIMARY KEY, + retain INTEGER + )" +dbExecute(con, sql) +dbReadTable(con, "retention") + +retention <- infection_history |> select(inf) +retention$retain <- as.integer(retention$inf %in% om_samp$inf) + +dbAppendTable(con, "retention", retention) +dbGetQuery(con, "SELECT SUM(retain) AS sum FROM retention") +dbDisconnect(con) diff --git a/R/seroprevalence.R b/R/seroprevalence.R new file mode 100644 index 00000000..42cf9a2d --- /dev/null +++ b/R/seroprevalence.R @@ -0,0 +1,140 @@ +#################################################### +# Process the serology data downloaded from CDC +# Process the FL ABM output and extract serology information +# Plot the seroprevalence from actual observation vs ABM +# Write out processed serology data as CSVs +#################################################### + +pacman::p_load(dplyr, ggplot2, lubridate, tidyr, glue) +pacman::p_load(RSQLite) +theme_set(theme_bw(base_size = 13)) +theme_update( + text = element_text(family = "Open Sans"), + strip.background = element_blank(), + strip.text = element_text(face = "bold"), + panel.grid = element_line(linewidth = rel(0.43), colour = "#D1D3D4"), + panel.border = element_rect(linewidth = rel(0.43)), + axis.ticks = element_line(linewidth = rel(0.43)) +) + +# Observed Serology +csv <- file.path( + "data", + "serological-data", + "Nationwide_Commercial_Laboratory_Seroprevalence_Survey_20231018.csv" +) +df <- data.table::fread(csv) +df + +cols <- c( + "Rate (%) [Anti-N, 0-17 Years Prevalence]", + "Rate (%) [Anti-N, 18-49 Years Prevalence, Rounds 1-30 only]", + "Rate (%) [Anti-N, 50-64 Years Prevalence, Rounds 1-30 only]", + "Rate (%) [Anti-N, 65+ Years Prevalence, Rounds 1-30 only]" +) + +df_flus <- df |> + filter(Site %in% c("FL", "US")) |> + select( + Site, `Date Range of Specimen Collection`, + all_of(cols) + ) + +df_flus <- df_flus |> + separate_wider_delim( + `Date Range of Specimen Collection`, + delim = "-", names = c("start_date", "end_date") + ) |> + mutate( + end_date = mdy(end_date), + start_date = ifelse(stringr::str_detect(start_date, ","), + start_date, glue("{start_date}{year(end_date)}") + ), + start_date = mdy(start_date), + mid_date = start_date + (end_date - start_date) / 2 + ) + + +df_flus_long <- df_flus |> + tidyr::pivot_longer( + contains("Rate"), + names_to = c("type", "age"), + names_pattern = "Rate \\(%\\) \\[(.*), (.*) Years Prevalence.*\\]" + ) + +ggplot(df_flus_long) + + geom_smooth(aes(x = mid_date, y = value, colour = Site), + span = 0.3 + ) + + geom_vline( + xintercept = ymd(c("2021-11-15", "2022-02-10")), + lty = 2 + ) + + facet_wrap(~age) + +#### Florida ABM + +age_df <- data.table::fread("./data/abm-data/sim_ages.txt") +con <- dbConnect(SQLite(), "./data/abm-data/sim_data_0.sqlite") +dbListTables(con) +q <- dbSendQuery( + con, + "SELECT inf_owner_id, infected_time FROM infection_history" +) +infection_history <- dbFetch(q) +dbClearResult(q) + +colnames(infection_history) +infection_history <- age_df |> + left_join(infection_history, by = join_by(pid == inf_owner_id)) +head(infection_history) +first_inf <- infection_history |> + group_by(pid, age) |> + summarise(infected_time = min(infected_time), .groups = "drop") |> + mutate(age = case_when( + age <= 17 ~ "0-17", + age <= 49 ~ "18-49", + age <= 64 ~ "50-64", + TRUE ~ "65+" + )) +pop_age <- first_inf |> + group_by(age) |> + summarise(total = n()) +serology <- first_inf |> + group_by(age, time = infected_time) |> + summarise(n = n()) |> + filter(!is.na(time)) |> + complete(time = 0:874, fill = list(n = 0)) |> + left_join(pop_age, by = "age") |> + mutate(seroprevalence = cumsum(n) / total) + +# Combine +serology_abm <- serology |> + mutate( + Site = "FL_ABM", + date = ymd("2020-02-10") + ddays(time), + seroprevalence = seroprevalence * 100 + ) |> + select(Site, age, date, seroprevalence) + +ggplot() + + geom_line(aes(x = mid_date, y = value, colour = Site), + data = df_flus_long # , alpha = 0.3, span = 0.4 + ) + + geom_line(aes(x = date, y = seroprevalence, colour = Site), + data = serology_abm + ) + + geom_vline( + xintercept = ymd(c("2021-11-15", "2022-02-10")), + lty = 2 + ) + + facet_wrap(~age) + + scale_x_date(date_breaks = "6 months", date_labels = "%m/%y") + + theme(axis.title.x = element_blank()) + + labs(y = "Seroprevalence %") + +ggsave("./fig/seroprevalence.png", width = 2800, height = 1800, units = "px") + +#### +data.table::fwrite(serology_abm, "serology_abm.csv") +data.table::fwrite(df_flus_long, "serology_flus.csv") diff --git a/sim_data_to_sero_generator.py b/sim_data_to_sero_generator.py index 30f8ccac..e4ee7d1c 100644 --- a/sim_data_to_sero_generator.py +++ b/sim_data_to_sero_generator.py @@ -4,7 +4,7 @@ import pandas as pd SIM_START_DATE = datetime.date(2020, 2, 10) -SIM_INPUT_PATH = "data/sim_data_0.sqlite" +SIM_INPUT_PATH = "data/abm-data/sim_data_scaled_us.sqlite" MODEL_INIT_DATE = datetime.date(2022, 2, 11) OUTPUT_DATA_PATH = "data/abm_population3.csv" @@ -13,8 +13,12 @@ cnx = sqlite3.connect(SIM_INPUT_PATH) res = cnx.execute("SELECT name FROM sqlite_master WHERE type='table';") print("available tables: ") -for name in res.fetchall(): - print(name[0]) +tables = [name[0] for name in res.fetchall()] +print(tables) + +retention = 1 if "retention" in tables else 0 +if retention: + print("subsampling of infection history detected") days_diff = (MODEL_INIT_DATE - SIM_START_DATE).days print("model init date: " + str(days_diff)) @@ -22,19 +26,28 @@ ### LOAD IN TABLES ########################################################## sim_people = pd.read_csv( - "data/sim_ages.txt", sep=" " + "data/abm-data/sim_ages.txt", sep=" " ) # all people in the simulation and their ages # all infections that occured before init date. # we exclude those who died from their infections, unless they died after the model init date -infection_history = pd.read_sql_query( - sql="""SELECT inf_owner_id, strain, infected_time, infectious_start, infectious_end +if retention: + sql = """SELECT inf.inf_owner_id, inf.strain, inf.infected_time, inf.infectious_start, + inf.infectious_end, r.retain + FROM infection_history AS inf + JOIN retention AS r ON inf.inf = r.inf + WHERE ((death_time == 2147483647 AND infected_time <= {}) OR + (death_time > {} AND infected_time <= {})) AND retain = 1""" +else: + sql = """SELECT inf_owner_id, strain, infected_time, infectious_start, infectious_end FROM infection_history WHERE (death_time == 2147483647 AND infected_time <= {}) OR - (death_time > {} AND infected_time <= {})""".format( - days_diff, days_diff, days_diff - ), + (death_time > {} AND infected_time <= {})""" + +infection_history = pd.read_sql_query( + sql=sql.format(days_diff, days_diff, days_diff), con=cnx, ) + vaccination_history = ( pd.read_sql_query( "SELECT * FROM vaccination_history WHERE vax_sim_day <= "