Feature/outbreak layer #84

Open · wants to merge 19 commits into base: main
Binary file not shown.
1 change: 1 addition & 0 deletions .gitignore
@@ -18,3 +18,4 @@ data/**/*.parquet
data/**/*.zip
data/**/*.nc
data/**/*.tif
+ data/*
18 changes: 13 additions & 5 deletions R/aggregate_augmented_data_by_adm.R
@@ -9,19 +9,27 @@
#' @author Emma Mendelsohn
#' @export
aggregate_augmented_data_by_adm <- function(augmented_data,
                                            rsa_polygon,
                                            model_dates_selected) {

# Read augmented data, convert to raster
r <- arrow::read_parquet(glue::glue("{augmented_data}/date={model_dates_selected}/part-0.parquet")) |>
rast()
  crs(r) <- crs(rast())  # empty rast() carries the default lon/lat WGS84 CRS

# Mask raster to polygon
r <- mask(r, rsa_polygon)

- p <- terra::extract(r, rsa_polygon, mean, na.rm = TRUE, weights = TRUE)
+ # Get the mean value by polygons
+ p <- terra::extract(r, rsa_polygon, mean, na.rm = TRUE, weights = FALSE, ID = FALSE)


  # Note: spatial weighting is handled at model fitting via district area;
  # the probability of an outbreak in a district scales with its size.

bind_cols(rsa_polygon, p) |>
as_tibble() |>
mutate(date = model_dates_selected) |>
-   select(date, shapeName, ID, names(p))
+   select(date, shapeName, names(p))
}
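A hedged usage sketch for the updated function; the paths and the districts object below are illustrative, not from this PR:

```r
# Minimal sketch, assuming an augmented-data directory partitioned by date
# and an sf polygon layer with a shapeName column (names are hypothetical).
library(sf)

districts <- st_read("data/rsa_districts.gpkg")  # hypothetical path

adm_means <- aggregate_augmented_data_by_adm(
  augmented_data       = "data/augmented",       # hypothetical path
  rsa_polygon          = districts,
  model_dates_selected = as.Date("2018-04-22")
)
# Result: one row per district with date, shapeName, and the unweighted
# mean of each raster layer (weights = FALSE, ID column dropped).
```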
45 changes: 45 additions & 0 deletions R/build_workflow.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#' Build the outbreak classification workflow
#'
#' Combines the 30-day outbreak recipe with a tunable xgboost specification.
#'
#' @param model_data_train training split used to define the recipe
#' @return a tidymodels workflow
#' @author Emma Mendelsohn
#' @export
build_workflow <- function(model_data_train) {

rec <- recipe(formula = as.formula(outbreak_30 ~
anomaly_relative_humidity_30 +
anomaly_temperature_30 +
anomaly_precipitation_30 +
anomaly_relative_humidity_60 +
anomaly_temperature_60 +
anomaly_precipitation_60 +
anomaly_relative_humidity_90 +
anomaly_temperature_90 +
anomaly_precipitation_90 +
anomaly_temperature_forecast_29 +
                                  anomaly_precipitation_forecast_29 +
                                  anomaly_relative_humidity_forecast_29 +
anomaly_ndvi_30 +
anomaly_ndvi_60 +
                                  anomaly_ndvi_90),
data = model_data_train) |>
step_mutate(outbreak_30 = as.factor(outbreak_30), skip = TRUE)

spec <-
boost_tree(trees = 1000, min_n = tune(), tree_depth = tune(), learn_rate = tune(),
loss_reduction = tune()) |>
    set_mode("classification") |>
    set_engine("xgboost", objective = "binary:logistic")

  model_workflow <-
    workflow() |>
    add_recipe(rec) |>
    add_model(spec)

  model_workflow
}
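A sketch of exercising the workflow once with arbitrary hyperparameter values (illustrative; in the pipeline these would come from tuning):

```r
# Resolve the tune() placeholders with arbitrary values and fit once,
# just to confirm the workflow assembles; this is not a tuned model.
library(tidymodels)

wf <- build_workflow(model_data_train)

wf_fit <- wf |>
  finalize_workflow(tibble::tibble(min_n = 10, tree_depth = 6,
                                   learn_rate = 0.05, loss_reduction = 0)) |>
  fit(data = model_data_train)
```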
36 changes: 36 additions & 0 deletions R/create_outbreak_layer.R
@@ -0,0 +1,36 @@
#' Build the outbreak outcome layer
#'
#' For each selected model date, flags the districts with an RVF outbreak
#' starting in the following 30 days.
#'
#' @param wahis_rvf_outbreaks_preprocessed preprocessed WAHIS RVF outbreak table
#' @param rsa_polygon polygon layer of administrative districts
#' @param model_dates_selected dates on which the model makes predictions
#' @return tibble of date, shapeName, and outbreak_30 = TRUE
#' @author Emma Mendelsohn
#' @export
create_outbreak_layer <- function(wahis_rvf_outbreaks_preprocessed,
rsa_polygon,
model_dates_selected) {

# Get polygons for outbreaks
rvf_points <- wahis_rvf_outbreaks_preprocessed |>
distinct(country_name, outbreak_id, outbreak_start_date, latitude, longitude) |>
st_as_sf(coords = c("longitude", "latitude"))
st_crs(rvf_points) <- st_crs(rsa_polygon) # Set the CRS (Coordinate Reference System)

rvf_points_polygon <- st_join(rvf_points, rsa_polygon, join = st_within) |>
drop_na(shapeGroup) |>
mutate(outbreak_start_date = ymd(outbreak_start_date))

# For each model selected date, determine which polygons had an outbreak in the following 30 days
rvf_points_polygon_dates <- map_dfr(model_dates_selected, function(model_date){
day_diff <- rvf_points_polygon$outbreak_start_date - model_date
rvf_points_polygon[which(day_diff >= 1 & day_diff <= 30),] |>
mutate(date = model_date)
})

# Return the unique combination of model select dates and districts with outbreaks
rvf_points_polygon_dates |>
distinct(date, shapeName) |>
mutate(outbreak_30 = TRUE)

}
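The 30-day forward window above can be checked with toy dates:

```r
# Toy illustration of the windowing: keep outbreaks starting 1-30 days
# after the model date.
library(lubridate)

model_date <- ymd("2018-01-01")
starts     <- ymd(c("2018-01-02", "2018-01-31", "2018-02-15"))
day_diff   <- starts - model_date
starts[day_diff >= 1 & day_diff <= 30]
#> "2018-01-02" "2018-01-31"
```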
19 changes: 10 additions & 9 deletions R/download_sentinel_ndvi.R
@@ -30,18 +30,19 @@ download_sentinel_ndvi <- function(sentinel_ndvi_api_parameters, download_direct

# auth
  auth <- POST("https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token",
               body = list(
                 grant_type = "password",
                 username = Sys.getenv("COPERNICUS_USERNAME"),
                 password = Sys.getenv("COPERNICUS_PASSWORD"),
                 client_id = "cdse-public"
               ),
               encode = "form")

- url <- glue::glue("http://catalogue.dataspace.copernicus.eu/odata/v1/Products({id})/$value")
+ url <- glue::glue("https://download.dataspace.copernicus.eu/odata/v1/Products({id})/$value")
file <- here::here(download_directory, glue::glue("{download_filename}.zip"))
response <- GET(url, add_headers(Authorization = paste("Bearer", content(auth)$access_token)),
write_disk(file, overwrite = TRUE))

  unzip(file, files = paste0(download_filename, ".SEN3/NDVI.nc"), exdir = download_directory)
file.remove(file)
file.rename(here::here(download_directory, paste0(download_filename, ".SEN3/NDVI.nc")),
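A guard like the following could sit between the token request and the download GET; it is a hedged addition, not part of this diff:

```r
# Hypothetical addition: fail fast if the Copernicus token request failed,
# instead of sending an unauthorized download request.
if (httr::http_error(auth)) {
  stop("Copernicus auth failed with status ", httr::status_code(auth))
}
```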
41 changes: 41 additions & 0 deletions R/get_holdout_masks.R
@@ -0,0 +1,41 @@
#' Get holdout masks
#'
#' For each holdout row, returns the (date, shapeName) combinations to exclude
#' from training: the same district for the 90 days after the holdout date,
#' and adjacent districts on the holdout date.
#'
#' @param holdout_data holdout rows with date and shapeName columns
#' @param mask_lookup list of lookups from make_mask_lookup()
#' @return distinct tibble of masked date/shapeName pairs
#' @author Emma Mendelsohn
#' @export
get_holdout_masks <- function(holdout_data, mask_lookup) {

date_masks <- holdout_data |>
select(date, shapeName) |>
left_join(mask_lookup$masked_dates_90_days_lookup, by = join_by(date)) |>
unnest(mask) |>
select(date = mask, shapeName)

shape_masks <- holdout_data |>
select(date, shapeName) |>
left_join(mask_lookup$masked_shapes_adjacent_lookup, by = join_by("shapeName" == "shape")) |>
unnest(mask) |>
select(date, shapeName = mask)

masks <- bind_rows(date_masks, shape_masks) |>
distinct()

return(masks)

  # take 1: mask shape and date jointly (incorrect: it misses masking the district itself for the following 3 months)
# holdout_data |>
# select(date, shapeName) |>
# left_join(mask_lookup$masked_dates_90_days_lookup, by = join_by(date)) |>
# rename(date_mask = mask) |>
# left_join(mask_lookup$masked_shapes_adjacent_lookup, by = join_by("shapeName" == "shape")) |>
# rename(shape_mask = mask) |>
# select(-date, -shapeName) |>
# unnest(shape_mask) |>
# unnest(date_mask)

}
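A hedged sketch of how the returned masks might be applied; the model_data object is a hypothetical stand-in:

```r
# Drop any training row whose (date, shapeName) pair is masked for this
# holdout; model_data is a hypothetical stand-in for the full data frame.
library(dplyr)

masks      <- get_holdout_masks(holdout_data, mask_lookup)
train_data <- anti_join(model_data, masks, by = c("date", "shapeName"))
```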
60 changes: 60 additions & 0 deletions R/get_mermaid.R
@@ -0,0 +1,60 @@
# Adapted from targets::tar_mermaid(): produces mermaid.js text for the
# pipeline graph. Relies on unexported targets internals (tar_assert_*,
# callr_outer, tar_mermaid_inner), so it assumes access to the targets
# namespace.
get_mermaid <- function(
targets_only = FALSE,
names = NULL,
shortcut = FALSE,
allow = NULL,
exclude = ".Random.seed",
outdated = TRUE,
label = NULL,
legend = TRUE,
color = TRUE,
reporter = targets::tar_config_get("reporter_outdated"),
seconds_reporter = targets::tar_config_get("seconds_reporter"),
callr_function = callr::r,
callr_arguments = targets::tar_callr_args_default(callr_function),
envir = parent.frame(),
script = targets::tar_config_get("script"),
store = targets::tar_config_get("store")
) {
tar_assert_allow_meta("tar_mermaid", store = store)
force(envir)
tar_assert_lgl(targets_only, "targets_only must be logical.")
tar_assert_lgl(outdated, "outdated in tar_mermaid() must be logical.")
tar_assert_in(label, c("time", "size", "branches"))
tar_assert_lgl(legend)
tar_assert_lgl(color)
tar_assert_scalar(legend)
tar_assert_scalar(color)
tar_config_assert_reporter_outdated(reporter)
tar_assert_callr_function(callr_function)
  tar_assert_list(callr_arguments, "callr_arguments must be a list.")
tar_assert_dbl(seconds_reporter)
tar_assert_scalar(seconds_reporter)
tar_assert_none_na(seconds_reporter)
tar_assert_ge(seconds_reporter, 0)
targets_arguments <- list(
path_store = store,
targets_only = targets_only,
names_quosure = rlang::enquo(names),
shortcut = shortcut,
allow_quosure = rlang::enquo(allow),
exclude_quosure = rlang::enquo(exclude),
outdated = outdated,
label = label,
legend = legend,
color = color,
reporter = reporter,
seconds_reporter = seconds_reporter
)
callr_outer(
targets_function = tar_mermaid_inner,
targets_arguments = targets_arguments,
callr_function = callr_function,
callr_arguments = callr_arguments,
envir = envir,
script = script,
store = store,
fun = "tar_mermaid"
)
}
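Assumed usage, mirroring targets::tar_mermaid(); because the function leans on unexported targets internals, this sketch presumes it runs where those helpers are visible:

```r
# Render the pipeline graph as mermaid.js text and save it; the output
# path is hypothetical.
lines <- get_mermaid(targets_only = TRUE)
writeLines(lines, "pipeline.mmd")
```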

29 changes: 29 additions & 0 deletions R/make_mask_lookup.R
@@ -0,0 +1,29 @@
#' Build date and adjacency mask lookups
#'
#' @param model_dates_selected dates on which the model makes predictions
#' @param rsa_polygon polygon layer of administrative districts
#' @return list with masked_dates_90_days_lookup and masked_shapes_adjacent_lookup
#' @author Emma Mendelsohn
#' @export
make_mask_lookup <- function(model_dates_selected, rsa_polygon) {

masked_dates_90_days_lookup <- map_dfr(model_dates_selected, function(date){
diffs <- model_dates_selected - date
tibble(date = date, mask = list(model_dates_selected[diffs > 0 & diffs <= 90]))
})

masked_shapes_adjacent_lookup <- map_dfr(1:nrow(rsa_polygon), function(i){
select_shape <- rsa_polygon[i,]
touches <- st_touches(select_shape, rsa_polygon)
touches_shapes <- rsa_polygon[unlist(touches),]
tibble(shape = select_shape$shapeName,
mask = list(touches_shapes$shapeName))
})

list(masked_dates_90_days_lookup = masked_dates_90_days_lookup,
masked_shapes_adjacent_lookup = masked_shapes_adjacent_lookup
)

}
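The date-mask logic can be checked with toy dates:

```r
# Toy check of the 90-day lookup: each date masks later dates within 90 days.
library(purrr)
library(tibble)

dates <- as.Date(c("2018-01-01", "2018-02-01", "2018-05-01"))
map_dfr(dates, function(date) {
  diffs <- dates - date
  tibble(date = date, mask = list(dates[diffs > 0 & diffs <= 90]))
})
# 2018-01-01 masks 2018-02-01 (31 days ahead); 2018-02-01 masks
# 2018-05-01 (89 days ahead); 2018-05-01 masks nothing.
```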
22 changes: 22 additions & 0 deletions R/model_grid.R
@@ -0,0 +1,22 @@
#' Latin hypercube grid for xgboost tuning
#'
#' @param training_data training data, used to finalize the mtry range
#' @return a 10-row tibble of candidate hyperparameter values
#' @author Emma Mendelsohn
#' @export
model_grid <- function(training_data) {

dials::grid_latin_hypercube(
dials::tree_depth(),
dials::min_n(),
dials::loss_reduction(),
sample_size = dials::sample_prop(),
dials::finalize(dials::mtry(), training_data),
dials::learn_rate(),
size = 10
)

}
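The grid itself is just a tibble of candidate values; a quick sanity check (illustrative):

```r
# One column per tunable parameter, 10 rows of space-filling candidates;
# training_data only informs the upper bound of mtry.
grid <- model_grid(training_data)
nrow(grid)   # 10
names(grid)  # tree_depth, min_n, loss_reduction, sample_size, mtry, learn_rate
```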
35 changes: 35 additions & 0 deletions R/model_recipe.R
@@ -0,0 +1,35 @@
#' Recipe for the 30-day outbreak model
#'
#' @param training_data training split used to define the recipe
#' @return an unprepped recipe
#' @author Emma Mendelsohn
#' @export
model_recipe <- function(training_data) {

recipe(formula = as.formula(outbreak_30 ~
# TODO add day of year
# TODO add static
# TODO add immunity layer
# TODO add recent outbreak layer - has there been an outbreak this season
anomaly_relative_humidity_30 +
anomaly_temperature_30 +
anomaly_precipitation_30 +
anomaly_relative_humidity_60 +
anomaly_temperature_60 +
anomaly_precipitation_60 +
anomaly_relative_humidity_90 +
anomaly_temperature_90 +
anomaly_precipitation_90 +
anomaly_temperature_forecast_29 +
                                anomaly_precipitation_forecast_29 +
                                anomaly_relative_humidity_forecast_29 +
anomaly_ndvi_30 +
anomaly_ndvi_60 +
anomaly_ndvi_90 +
                                area),
                   data = training_data)

}
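A hedged check of what the recipe produces (no steps beyond variable selection, so baking returns the selected columns):

```r
# Prep on the training split and bake it back to inspect the exact
# outcome/predictor columns the model will receive.
library(recipes)

rec   <- model_recipe(training_data)
baked <- bake(prep(rec, training = training_data), new_data = NULL)
names(baked)
```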
30 changes: 30 additions & 0 deletions R/model_specs.R
@@ -0,0 +1,30 @@
#' xgboost model specification
#'
#' @param base_score background outbreak rate used as the xgboost intercept
#' @param interaction_constraints xgboost interaction groups (keeps area from interacting)
#' @param monotone_constraints per-feature monotonicity (positive for area)
#' @return a tunable parsnip boost_tree specification
#' @author Emma Mendelsohn
#' @export
model_specs <- function(base_score, interaction_constraints, monotone_constraints) {

parsnip::boost_tree(
trees = 1000,
tree_depth = hardhat::tune(),
min_n = hardhat::tune(),
loss_reduction = hardhat::tune(),
sample_size = hardhat::tune(),
mtry = hardhat::tune(),
learn_rate = hardhat::tune()
) |>
parsnip::set_engine("xgboost",
objective = "binary:logistic",
base_score = base_score, # set the background/intercept rate - this allows the tree to split even when the training is all negatives
interaction_constraints = interaction_constraints, # do not interact on area
monotone_constraints = monotone_constraints # enforce positive relationship for area
) |>
parsnip::set_mode("classification")

}
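A hedged sketch of the constraint arguments this spec expects; the feature count and ordering (15 anomaly predictors followed by area) are assumptions, not taken from the PR:

```r
# xgboost feature indices are 0-based. Let indices 0-14 be the anomaly
# predictors and 15 be area (assumed ordering).
interaction_constraints <- list(0:14)        # anomaly features interact; area does not
monotone_constraints    <- c(rep(0, 15), 1)  # unconstrained except increasing in area

spec <- model_specs(
  base_score              = mean(training_data$outbreak_30),  # assumed background rate
  interaction_constraints = interaction_constraints,
  monotone_constraints    = monotone_constraints
)
```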