diff --git a/R/abm_resampling.R b/R/abm_resampling.R
new file mode 100644
index 00000000..39430d61
--- /dev/null
+++ b/R/abm_resampling.R
@@ -0,0 +1,217 @@
+####################################################
+# Rescale the output of ABM such that it matches with US
+#   serology data
+# This is done by first subsampling the pre-omicron
+#   infections in the ABM to match with seroprevalence
+#   on 2021-11-15
+# Then subsampling the omicron infections to match with
+#   seroprevalence on 2022-02-10
+# Store the subsample as a `retention` table in the
+#   sqlite database (1 to be retained, 0 to be removed)
+####################################################
+
+pacman::p_load(dplyr, data.table, lubridate, tidyr, ggplot2)
+theme_set(theme_bw(base_size = 13))
+theme_update(
+  text = element_text(family = "Open Sans"),
+  strip.background = element_blank(),
+  strip.text = element_text(face = "bold"),
+  panel.grid = element_line(linewidth = rel(0.43), colour = "#D1D3D4"),
+  panel.border = element_rect(linewidth = rel(0.43)),
+  axis.ticks = element_line(linewidth = rel(0.43))
+)
+
+sero_flus <- fread("./data/abm-data/serology_flus.csv")
+sero_abm <- fread("./data/abm-data/serology_abm.csv")
+
+crude_auc <- function(days, values, rebase = FALSE) {
+  if (rebase) values <- values - values[1]
+  xout <- seq(min(days), max(days), by = 1)
+  yout <- approx(days, values, xout)$y
+  return(sum(yout))
+}
+
+sample_inf_hist <- function(df, probs) {
+  retain <- rbinom(nrow(df), 1, probs)
+  return(df[retain == 1, ])
+}
+
+calc_sero <- function(inf_df) {
+  first_infections <- inf_df |>
+    group_by(pid, age) |>
+    summarise(infected_time = min(infected_time))
+
+  serology <- first_infections |>
+    group_by(age, time = infected_time) |>
+    summarise(n = n()) |>
+    complete(time = 0:874, fill = list(n = 0)) |>
+    left_join(pop_age, by = "age") |>
+    mutate(
+      seroprevalence = cumsum(n) / pop * 100,
+      Site = "ABM_scaled",
+      date = ymd("2020-02-10") + time
+    )
+
+  return(serology)
+}
+
+plot_sero <- function(abm_df) {
+  ggplot() +
+    geom_line(aes(x = mid_date, y = value, colour = Site),
+      data = sero_flus |> filter(Site == "US")
+    ) +
+    geom_line(aes(x = date, y = seroprevalence, colour = Site),
+      data = abm_df
+    ) +
+    geom_vline(
+      xintercept = ymd(c("2021-11-15", "2022-02-10")),
+      lty = 2
+    ) +
+    facet_wrap(~age) +
+    scale_x_date(date_breaks = "6 months", date_labels = "%m/%y") +
+    theme(axis.title.x = element_blank()) +
+    labs(y = "Seroprevalence %")
+}
+
+#### ABM infection history
+age_df <- data.table::fread("./data/abm-data/sim_ages.txt")
+db_path <- "./data/abm-data/sim_data_0.sqlite"
+db_out_path <- "./data/abm-data/sim_data_scaled_us.sqlite"
+file.copy(db_path, db_out_path, overwrite = TRUE)
+con <- dbConnect(SQLite(), db_out_path)
+dbListTables(con)
+q <- dbSendQuery(
+  con,
+  "SELECT inf, inf_owner_id, infected_time, strain FROM infection_history"
+)
+query_inf_history <- dbFetch(q)
+dbClearResult(q)
+
+age_df <- age_df |>
+  mutate(age = case_when(
+    age <= 17 ~ "0-17",
+    age <= 49 ~ "18-49",
+    age <= 64 ~ "50-64",
+    TRUE ~ "65+"
+  ))
+pop_age <- age_df |>
+  group_by(age) |>
+  summarise(pop = n())
+infection_history <- query_inf_history |>
+  left_join(age_df, by = join_by(inf_owner_id == pid)) |>
+  rename(pid = inf_owner_id) |>
+  mutate(
+    date = ymd("2020-02-10") + infected_time,
+    strain = ifelse(strain == "OMICRON", "OMICRON", "PREOM")
+  )
+
+#### Pre-Omicron scaling
+preom_obs <- sero_flus |>
+  filter(Site == "US") |>
+  select(date = mid_date, age, value) |>
+  filter(date <= ymd("2021-11-15"))
+preom_start <- min(preom_obs$date)
+preom_stop <- max(preom_obs$date)
+
+preom_obs_auc <- preom_obs |>
+  group_by(age) |>
+  summarise(
+    auc_obs = crude_auc(date, value, rebase = FALSE),
+    last_obs = value[date == preom_stop]
+  )
+
+preom_abm_auc <- sero_abm |>
+  filter(date >= preom_start, date <= preom_stop) |>
+  group_by(age) |>
+  summarise(
+    auc_abm = crude_auc(date, seroprevalence, rebase = FALSE),
+    last_abm = seroprevalence[date == preom_stop],
+    first_abm = 0
+  )
+
+preom_scale <- preom_obs_auc |>
+  left_join(preom_abm_auc, by = "age") |>
+  mutate(
+    scale_preom_auc = auc_obs / auc_abm,
+    scale_om_lin = (last_obs - first_abm) / (last_abm - first_abm)
+  ) |>
+  select(-contains("obs"), -contains("abm"))
+
+#### Sample preom
+preom_samp <- infection_history |>
+  left_join(preom_scale, by = "age")
+probs <- ifelse(
+  preom_samp$strain == "OMICRON",
+  1,
+  preom_samp$scale_preom_auc # auc scale is better fit
+)
+preom_samp <- sample_inf_hist(infection_history, probs)
+preom_sero <- calc_sero(preom_samp)
+
+plot_sero(preom_sero) # Matches well on first dash line
+
+#### Omicron scaling
+om_obs <- sero_flus |>
+  filter(Site == "US") |>
+  select(date = mid_date, age, value) |>
+  filter(date >= ymd("2021-11-15"), date <= ymd("2022-02-11"))
+om_start <- min(om_obs$date)
+om_stop <- max(om_obs$date)
+
+om_obs_auc <- om_obs |>
+  group_by(age) |>
+  summarise(
+    auc_obs = crude_auc(date, value, rebase = TRUE),
+    last_obs = value[date == om_stop]
+  )
+
+om_abm_auc <- preom_sero |>
+  filter(date >= om_start, date <= om_stop) |>
+  group_by(age) |>
+  summarise(
+    auc_abm = crude_auc(date, seroprevalence, rebase = TRUE),
+    last_abm = seroprevalence[date == om_stop],
+    first_abm = seroprevalence[date == om_start]
+  )
+
+om_scale <- om_obs_auc |>
+  left_join(om_abm_auc, by = "age") |>
+  left_join(pintersect, by = "age") |>
+  mutate(
+    scale_om_auc = auc_obs / auc_abm / (1 - p),
+    scale_om_lin = (last_obs - first_abm) / (last_abm - first_abm) # / (1 - p)
+  ) |>
+  select(-contains("obs"), -contains("abm"))
+
+#### Sample pre- and post-om
+om_samp <- preom_samp |>
+  left_join(om_scale, by = "age")
+probs <- ifelse(
+  om_samp$strain == "OMICRON",
+  om_samp$scale_om_lin, # linear scale is better here
+  1
+)
+om_samp <- sample_inf_hist(preom_samp, probs)
+om_sero <- calc_sero(om_samp)
+
+plot_sero(om_sero)
+
+ggsave("./fig/scaled_seroprevalence.png",
+  width = 2800, height = 1800,
+  units = "px"
+)
+
+#### write to database
+sql <- "CREATE TABLE retention(
+          inf TEXT PRIMARY KEY,
+          retain INTEGER
+        )"
+dbExecute(con, sql)
+dbReadTable(con, "retention")
+
+retention <- infection_history |> select(inf)
+retention$retain <- as.integer(retention$inf %in% om_samp$inf)
+
+dbAppendTable(con, "retention", retention)
+dbGetQuery(con, "SELECT SUM(retain) AS sum FROM retention")
+dbDisconnect(con)
diff --git a/R/seroprevalence.R b/R/seroprevalence.R
new file mode 100644
index 00000000..42cf9a2d
--- /dev/null
+++ b/R/seroprevalence.R
@@ -0,0 +1,140 @@
+####################################################
+# Process the serology data downloaded from CDC
+# Process the FL ABM output and extract serology information
+# Plot the seroprevalence from actual observation vs ABM
+# Write out processed serology data as CSVs
+####################################################
+
+pacman::p_load(dplyr, ggplot2, lubridate, tidyr, glue)
+pacman::p_load(RSQLite)
+theme_set(theme_bw(base_size = 13))
+theme_update(
+  text = element_text(family = "Open Sans"),
+  strip.background = element_blank(),
+  strip.text = element_text(face = "bold"),
+  panel.grid = element_line(linewidth = rel(0.43), colour = "#D1D3D4"),
+  panel.border = element_rect(linewidth = rel(0.43)),
+  axis.ticks = element_line(linewidth = rel(0.43))
+)
+
+# Observed Serology
+csv <- file.path(
+  "data",
+  "serological-data",
+  "Nationwide_Commercial_Laboratory_Seroprevalence_Survey_20231018.csv"
+)
+df <- data.table::fread(csv)
+df
+
+cols <- c(
+  "Rate (%) [Anti-N, 0-17 Years Prevalence]",
+  "Rate (%) [Anti-N, 18-49 Years Prevalence, Rounds 1-30 only]",
+  "Rate (%) [Anti-N, 50-64 Years Prevalence, Rounds 1-30 only]",
+  "Rate (%) [Anti-N, 65+ Years Prevalence, Rounds 1-30 only]"
+)
+
+df_flus <- df |>
+  filter(Site %in% c("FL", "US")) |>
+  select(
+    Site, `Date Range of Specimen Collection`,
+    all_of(cols)
+  )
+
+df_flus <- df_flus |>
+  separate_wider_delim(
+    `Date Range of Specimen Collection`,
+    delim = "-", names = c("start_date", "end_date")
+  ) |>
+  mutate(
+    end_date = mdy(end_date),
+    start_date = ifelse(stringr::str_detect(start_date, ","),
+      start_date, glue("{start_date}{year(end_date)}")
+    ),
+    start_date = mdy(start_date),
+    mid_date = start_date + (end_date - start_date) / 2
+  )
+
+
+df_flus_long <- df_flus |>
+  tidyr::pivot_longer(
+    contains("Rate"),
+    names_to = c("type", "age"),
+    names_pattern = "Rate \\(%\\) \\[(.*), (.*) Years Prevalence.*\\]"
+  )
+
+ggplot(df_flus_long) +
+  geom_smooth(aes(x = mid_date, y = value, colour = Site),
+    span = 0.3
+  ) +
+  geom_vline(
+    xintercept = ymd(c("2021-11-15", "2022-02-10")),
+    lty = 2
+  ) +
+  facet_wrap(~age)
+
+#### Florida ABM
+
+age_df <- data.table::fread("./data/abm-data/sim_ages.txt")
+con <- dbConnect(SQLite(), "./data/abm-data/sim_data_0.sqlite")
+dbListTables(con)
+q <- dbSendQuery(
+  con,
+  "SELECT inf_owner_id, infected_time FROM infection_history"
+)
+infection_history <- dbFetch(q)
+dbClearResult(q)
+
+colnames(infection_history)
+infection_history <- age_df |>
+  left_join(infection_history, by = join_by(pid == inf_owner_id))
+head(infection_history)
+first_inf <- infection_history |>
+  group_by(pid, age) |>
+  summarise(infected_time = min(infected_time), .groups = "drop") |>
+  mutate(age = case_when(
+    age <= 17 ~ "0-17",
+    age <= 49 ~ "18-49",
+    age <= 64 ~ "50-64",
+    TRUE ~ "65+"
+  ))
+pop_age <- first_inf |>
+  group_by(age) |>
+  summarise(total = n())
+serology <- first_inf |>
+  group_by(age, time = infected_time) |>
+  summarise(n = n()) |>
+  filter(!is.na(time)) |>
+  complete(time = 0:874, fill = list(n = 0)) |>
+  left_join(pop_age, by = "age") |>
+  mutate(seroprevalence = cumsum(n) / total)
+
+# Combine
+serology_abm <- serology |>
+  mutate(
+    Site = "FL_ABM",
+    date = ymd("2020-02-10") + ddays(time),
+    seroprevalence = seroprevalence * 100
+  ) |>
+  select(Site, age, date, seroprevalence)
+
+ggplot() +
+  geom_line(aes(x = mid_date, y = value, colour = Site),
+    data = df_flus_long # , alpha = 0.3, span = 0.4
+  ) +
+  geom_line(aes(x = date, y = seroprevalence, colour = Site),
+    data = serology_abm
+  ) +
+  geom_vline(
+    xintercept = ymd(c("2021-11-15", "2022-02-10")),
+    lty = 2
+  ) +
+  facet_wrap(~age) +
+  scale_x_date(date_breaks = "6 months", date_labels = "%m/%y") +
+  theme(axis.title.x = element_blank()) +
+  labs(y = "Seroprevalence %")
+
+ggsave("./fig/seroprevalence.png", width = 2800, height = 1800, units = "px")
+
+####
+data.table::fwrite(serology_abm, "serology_abm.csv")
+data.table::fwrite(df_flus_long, "serology_flus.csv")
diff --git a/sim_data_to_sero_generator.py b/sim_data_to_sero_generator.py
index 30f8ccac..e4ee7d1c 100644
--- a/sim_data_to_sero_generator.py
+++ b/sim_data_to_sero_generator.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 SIM_START_DATE = datetime.date(2020, 2, 10)
-SIM_INPUT_PATH = "data/sim_data_0.sqlite"
+SIM_INPUT_PATH = "data/abm-data/sim_data_scaled_us.sqlite"
 MODEL_INIT_DATE = datetime.date(2022, 2, 11)
 OUTPUT_DATA_PATH = "data/abm_population3.csv"
 
@@ -13,8 +13,12 @@
 cnx = sqlite3.connect(SIM_INPUT_PATH)
 res = cnx.execute("SELECT name FROM sqlite_master WHERE type='table';")
 print("available tables: ")
-for name in res.fetchall():
-    print(name[0])
+tables = [name[0] for name in res.fetchall()]
+print(tables)
+
+retention = 1 if "retention" in tables else 0
+if retention:
+    print("subsampling of infection history detected")
 
 days_diff = (MODEL_INIT_DATE - SIM_START_DATE).days
 print("model init date: " + str(days_diff))
@@ -22,19 +26,28 @@
 ### LOAD IN TABLES ##########################################################
 
 sim_people = pd.read_csv(
-    "data/sim_ages.txt", sep=" "
+    "data/abm-data/sim_ages.txt", sep=" "
 )  # all people in the simulation and their ages
 # all infections that occured before init date.
 # we exclude those who died from their infections, unless they died after the model init date
-infection_history = pd.read_sql_query(
-    sql="""SELECT inf_owner_id, strain, infected_time, infectious_start, infectious_end
+if retention:
+    sql = """SELECT inf.inf_owner_id, inf.strain, inf.infected_time, inf.infectious_start,
+    inf.infectious_end, r.retain
+    FROM infection_history AS inf
+    JOIN retention AS r ON inf.inf = r.inf
+    WHERE ((death_time == 2147483647 AND infected_time <= {}) OR
+    (death_time > {} AND infected_time <= {})) AND retain = 1"""
+else:
+    sql = """SELECT inf_owner_id, strain, infected_time, infectious_start, infectious_end
     FROM infection_history
     WHERE (death_time == 2147483647 AND infected_time <= {}) OR
-    (death_time > {} AND infected_time <= {})""".format(
-        days_diff, days_diff, days_diff
-    ),
+    (death_time > {} AND infected_time <= {})"""
+
+infection_history = pd.read_sql_query(
+    sql=sql.format(days_diff, days_diff, days_diff),
     con=cnx,
 )
+
 vaccination_history = (
     pd.read_sql_query(
         "SELECT * FROM vaccination_history WHERE vax_sim_day <= "