diff --git a/DESCRIPTION b/DESCRIPTION index 3ce98cd..2a4d646 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,6 +32,7 @@ Imports: PSW, ragg, rsample (>= 0.0.5), + sandwich, skimr, smd, survey, diff --git a/chapters/chapter-04.qmd b/chapters/chapter-04.qmd index 7e0b481..42aee13 100644 --- a/chapters/chapter-04.qmd +++ b/chapters/chapter-04.qmd @@ -67,7 +67,8 @@ data <- data.frame( ) ggplot(data, aes(x = x, y = y)) + - geom_text(aes(label = labels, angle = angle, vjust = 0), + geom_text( + aes(label = labels, angle = angle, vjust = 0), size = 7 ) + geom_segment(aes(x = 1, xend = 2, y = 0.95, yend = 0.95)) + @@ -463,9 +464,11 @@ d |> as.data.frame() -> d library(PSW) df <- as.data.frame(d) -x <- psw(df, +x <- psw( + df, "treatment ~ weight + age", - weight = "ATE", wt = TRUE, + weight = "ATE", + wt = TRUE, out.var = "y" ) tibble( @@ -568,9 +571,11 @@ d |> as.data.frame() -> d library(PSW) df <- as.data.frame(d) -x <- psw(df, +x <- psw( + df, "treatment ~ weight + age", - weight = "ATE", wt = TRUE, + weight = "ATE", + wt = TRUE, out.var = "y" ) tibble( diff --git a/chapters/chapter-05.qmd b/chapters/chapter-05.qmd index 2086c34..d9ed70f 100644 --- a/chapters/chapter-05.qmd +++ b/chapters/chapter-05.qmd @@ -23,37 +23,37 @@ Data collected as a review of causal diagrams in applied health research papers #| fig-cap: "Percentage of health research papers using causal diagrams over time." dag_data <- readxl::read_xlsx(here::here("data", "dag_data.xlsx")) -dag_data <- dag_data |> +dag_data <- dag_data |> separate( - dag_number, - into = c("dag_number", "dag_location"), - convert = TRUE, + dag_number, + into = c("dag_number", "dag_location"), + convert = TRUE, extra = "merge" - ) |> + ) |> separate( - consistent_direction, + consistent_direction, into = c("consistent_direction", "dag_ordering"), extra = "merge" - ) |> + ) |> separate( - report_adjset, + report_adjset, into = c("report_adjset", "report_adjset_loc"), extra = "merge" - ) |> + ) |> mutate( year = str_extract(citation, "\\d{4}") |> as.integer(), used_dag = !is.na(nodes), reported_estimand = !str_detect(report_estimand, "Not reported") ) -dag_data |> - count(year, used_dag) |> - mutate(pct = n / sum(n)) |> - filter(used_dag, year < max(year)) |> +dag_data |> + count(year, used_dag) |> + mutate(pct = n / sum(n)) |> + filter(used_dag, year < max(year)) |> ggplot(aes(year, pct)) + - geom_line(color = "#0072B2", linewidth = .9) + + geom_line(color = "#0072B2", linewidth = .9) + scale_y_continuous( - name = "percent of papers", + name = "percent of papers", labels = scales::label_percent() ) ``` @@ -82,9 +82,9 @@ Here, we are saying that `x` causes `y`. #| fig-height: 2 #| fig-cap: "A causal directed acyclic graph (DAG). DAGs depict causal relationships. In this DAG, the assumption is that `x` causes `y`." library(ggdag) -dagify(y ~ x, coords = time_ordered_coords()) |> - ggdag() + - theme_dag() + +dagify(y ~ x, coords = time_ordered_coords()) |> + ggdag() + + theme_dag() + expand_plot(expand_x = expansion(c(.2, .2))) ``` @@ -122,17 +122,17 @@ collider <- dagify( coords = coords ) -dag_flows <- map(list(fork = fork, chain = chain, collider = collider), tidy_dagitty) |> - map("data") |> - list_rbind(names_to = "dag") |> +dag_flows <- map(list(fork = fork, chain = chain, collider = collider), tidy_dagitty) |> + map("data") |> + list_rbind(names_to = "dag") |> mutate(dag = factor(dag, levels = c("fork", "chain", "collider"))) -dag_flows |> +dag_flows |> ggplot(aes(x = x, y = y, xend = xend, yend = yend)) + - geom_dag_edges(edge_width = 1) + - geom_dag_point() + - geom_dag_text() + - facet_wrap(~ dag) + + geom_dag_edges(edge_width = 1) + + geom_dag_point() + + geom_dag_text() + + facet_wrap(~dag) + expand_plot( expand_x = expansion(c(0.2, 0.2)), expand_y = expansion(c(0.2, 0.2)) @@ -220,17 +220,17 @@ y <- -3 * q + rnorm(n) confounder_data <- tibble(x, y, q = as.factor(q)) -p1 <- confounder_data |> - ggplot(aes(x, y)) + - geom_point(alpha = .2) + - geom_smooth(method = "lm", se = FALSE, color = "black") + - facet_wrap(~ "not adjusting for `q`\n(biased)") +p1 <- confounder_data |> + ggplot(aes(x, y)) + + geom_point(alpha = .2) + + geom_smooth(method = "lm", se = FALSE, color = "black") + + facet_wrap(~"not adjusting for `q`\n(biased)") -p2 <- confounder_data |> - ggplot(aes(x, y, color = q)) + - geom_point(alpha = .2) + +p2 <- confounder_data |> + ggplot(aes(x, y, color = q)) + + geom_point(alpha = .2) + geom_smooth(method = "lm", se = FALSE) + - facet_wrap(~ "adjusting for `q`\n(unbiased)") + facet_wrap(~"adjusting for `q`\n(unbiased)") p1 + p2 ``` @@ -259,7 +259,7 @@ x <- rnorm(n) ### ### q -linear_pred <- 2*x + rnorm(n) +linear_pred <- 2 * x + rnorm(n) prob <- 1 / (1 + exp(-linear_pred)) q <- rbinom(n, size = 1, prob = prob) ### @@ -270,17 +270,17 @@ y <- 2 * q + rnorm(n) mediator_data <- tibble(x, y, q = as.factor(q)) -p1 <- mediator_data |> - ggplot(aes(x, y)) + - geom_point(alpha = .2) + - geom_smooth(method = "lm", se = FALSE, color = "black") + - facet_wrap(~ "not adjusting for `q`\n(total effect)") +p1 <- mediator_data |> + ggplot(aes(x, y)) + + geom_point(alpha = .2) + + geom_smooth(method = "lm", se = FALSE, color = "black") + + facet_wrap(~"not adjusting for `q`\n(total effect)") -p2 <- mediator_data |> - ggplot(aes(x, y, color = q)) + - geom_point(alpha = .2) + +p2 <- mediator_data |> + ggplot(aes(x, y, color = q)) + + geom_point(alpha = .2) + geom_smooth(method = "lm", se = FALSE) + - facet_wrap(~ "adjusting for `q`\n(direct effect)") + facet_wrap(~"adjusting for `q`\n(direct effect)") p1 + p2 ``` @@ -303,7 +303,7 @@ We end up with a biased effect of `x` on `y`. #| label: fig-collider-scatter #| message: false #| fig-cap: "Two scatterplots of the relationship between `x` and `y`. The unadjusted relationship between the two is unbiased. When accounting for `q`, we open a colliding backdoor path and bias the relationship between `x` and `y`." -#| +#| ### x x <- rnorm(n) ### @@ -313,23 +313,23 @@ y <- rnorm(n) ### ### q -linear_pred <- 2*x + 3*y + rnorm(n) +linear_pred <- 2 * x + 3 * y + rnorm(n) prob <- 1 / (1 + exp(-linear_pred)) q <- rbinom(n, size = 1, prob = prob) ### collider_data <- tibble(x, y, q = as.factor(q)) -p1 <- collider_data |> - ggplot(aes(x, y)) + - geom_point(alpha = .2) + - geom_smooth(method = "lm", se = FALSE, color = "black") + +p1 <- collider_data |> + ggplot(aes(x, y)) + + geom_point(alpha = .2) + + geom_smooth(method = "lm", se = FALSE, color = "black") + facet_wrap(~"not adjusting for `q`\n(unbiased)") -p2 <- collider_data |> - ggplot(aes(x, y, color = q)) + - geom_point(alpha = .2) + - geom_smooth(method = "lm", se = FALSE) + +p2 <- collider_data |> + ggplot(aes(x, y, color = q)) + + geom_point(alpha = .2) + + geom_smooth(method = "lm", se = FALSE) + facet_wrap(~"adjusting for `q`\n(biased)") p1 + p2 @@ -350,33 +350,33 @@ At time point 2, `q` happens due to `x` and `y`. coords <- list(x = c(x = 0, y = 2, q = 1), y = c(x = 0, y = 0, q = -1)) collider <- dagify( - q ~ x + y, - exposure = "x", - outcome = "y", - coords = time_ordered_coords() + q ~ x + y, + exposure = "x", + outcome = "y", + coords = time_ordered_coords() ) -collider_t <- collider |> - tidy_dagitty() |> - mutate(time = "time point 1", direction = NA, to = NA) |> +collider_t <- collider |> + tidy_dagitty() |> + mutate(time = "time point 1", direction = NA, to = NA) |> filter(name != "q") -t2 <- collider |> - tidy_dagitty() |> - mutate(time = "time point 2") |> +t2 <- collider |> + tidy_dagitty() |> + mutate(time = "time point 2") |> pull_dag_data() collider_t$data <- bind_rows(collider_t$data, t2) -collider_t |> - mutate(deemphasize = (name %in% c("x", "y") & time == - "time point 2")) |> +collider_t |> + mutate(deemphasize = (name %in% c("x", "y") & time == + "time point 2")) |> ggplot(aes(x = x, y = y, xend = xend, yend = yend)) + - geom_dag_edges(edge_width = 1, edge_color = "grey85") + - geom_dag_point(aes(color = deemphasize), show.legend = FALSE) + - geom_dag_text() + - facet_wrap(~ time) + - theme_dag() + + geom_dag_edges(edge_width = 1, edge_color = "grey85") + + geom_dag_point(aes(color = deemphasize), show.legend = FALSE) + + geom_dag_text() + + facet_wrap(~time) + + theme_dag() + scale_color_manual(values = c("TRUE" = "grey85", "FALSE" = "black")) ``` @@ -501,9 +501,9 @@ podcast_dag <- dagify( coords = time_ordered_coords( list( # time point 1 - c("prepared", "humor", "mood"), + c("prepared", "humor", "mood"), # time point 2 - "podcast", + "podcast", # time point 3 "exam" ) @@ -557,7 +557,7 @@ pod_dag <- dagify( ) # automatically determine layouts -pod_dag |> +pod_dag |> ggdag(text_size = 2.8) ``` @@ -567,7 +567,7 @@ We can also ask for a specific layout, e.g., the popular Sugiyama algorithm for #| fig-width: 4 #| fig-height: 4 #| fig-align: center -pod_dag |> +pod_dag |> ggdag(layout = "sugiyama", text_size = 2.8) ``` @@ -583,7 +583,7 @@ We know that's not the case: listening to the podcast happened before taking the #| fig-width: 4 #| fig-height: 4 #| fig-align: center -pod_dag |> +pod_dag |> ggdag(layout = "time_ordered", text_size = 2.8) ``` @@ -603,9 +603,9 @@ In @fig-paths-podcast, we see two open paths: `podcast <- mood -> exam"` and `po ```{r} #| label: fig-paths-podcast -#| fig-cap: "`ggdag_paths()` visualizes open paths in a DAG. There are two open paths in `podcast_dag`: the fork from `mood` and the fork from `prepared`." -podcast_dag |> - # show the whole dag as a light gray "shadow" +#| fig-cap: "`ggdag_paths()` visualizes open paths in a DAG. There are two open paths in `podcast_dag`: the fork from `mood` and the fork from `prepared`." +podcast_dag |> + # show the whole dag as a light gray "shadow" # rather than just the paths ggdag_paths(shadow = TRUE, text = FALSE, use_labels = "label") ``` @@ -615,7 +615,7 @@ podcast_dag |> This is handy if you want to manipulate the DAG programmatically. ```{r} -podcast_dag_tidy <- podcast_dag |> +podcast_dag_tidy <- podcast_dag |> tidy_dagitty() podcast_dag_tidy @@ -626,8 +626,8 @@ For instance, `dag_paths()` underlies `ggdag_paths()`; it returns a tidy DAG wit You can use several dplyr functions on these objects directly. ```{r} -podcast_dag_tidy |> - dag_paths() |> +podcast_dag_tidy |> + dag_paths() |> filter(set == 2, path == "open path") ``` @@ -636,8 +636,8 @@ Tidy DAGs are not pure data frames, but you can retrieve either the `dataframe` ```{r} library(dagitty) -podcast_dag_tidy |> - pull_dag() |> +podcast_dag_tidy |> + pull_dag() |> paths() ``` ::: @@ -654,8 +654,8 @@ Any arrows coming out of adjusted variables are removed from the DAG because the #| fig-align: center #| fig-cap: "A visualization of the minimal adjustment set for the podcast-exam DAG. If this DAG is correct, two variables are required to block the backdoor paths: `mood` and `prepared`." ggdag_adjustment_set( - podcast_dag, - text = FALSE, + podcast_dag, + text = FALSE, use_labels = "label" ) ``` @@ -677,20 +677,24 @@ Here's a condensed version of what `ggdag_adjustment_set()` is doing: #| fig-width: 4.5 #| fig-height: 4.5 #| fig-align: center -podcast_dag_tidy |> +podcast_dag_tidy |> # add adjustment sets to data dag_adjustment_sets() |> ggplot(aes( - x = x, y = y, xend = xend, yend = yend, - color = adjusted, shape = adjusted - )) + + x = x, + y = y, + xend = xend, + yend = yend, + color = adjusted, + shape = adjusted + )) + # ggdag's custom geoms: add nodes, edges, and labels - geom_dag_point() + + geom_dag_point() + # remove adjusted paths - geom_dag_edges_link(data = \(.df) filter(.df, adjusted != "adjusted")) + - geom_dag_label_repel() + + geom_dag_edges_link(data = \(.df) filter(.df, adjusted != "adjusted")) + + geom_dag_label_repel() + # you can use any ggplot function, too - facet_wrap(~ set) + + facet_wrap(~set) + scale_shape_manual(values = c(adjusted = 15, unadjusted = 19)) ``` ::: @@ -707,8 +711,8 @@ Full adjustment sets are every combination of variables that result in a valid s #| fig-align: center #| fig-cap: "All valid adjustment sets for `podcast_dag`." ggdag_adjustment_set( - podcast_dag, - text = FALSE, + podcast_dag, + text = FALSE, use_labels = "label", # get full adjustment sets type = "all" @@ -750,15 +754,16 @@ In contrast, the model that adjusted for the two variables as suggested by `ggda #| label: fig-dag-sim #| fig-cap: "Forest plot of simulated data based on the DAG described in @fig-dag-podcast." ## Model that does not close backdoor paths +library(broom) unadjusted_model <- lm(exam ~ podcast, sim_data) |> - broom::tidy(conf.int = TRUE) |> - dplyr::filter(term == "podcast") |> + tidy(conf.int = TRUE) |> + filter(term == "podcast") |> mutate(formula = "podcast") ## Model that closes backdoor paths adjusted_model <- lm(exam ~ podcast + mood + prepared, sim_data) |> - broom::tidy(conf.int = TRUE) |> - dplyr::filter(term == "podcast") |> + tidy(conf.int = TRUE) |> + filter(term == "podcast") |> mutate(formula = "podcast + mood + prepared") bind_rows( @@ -795,23 +800,23 @@ Let's take a look at @fig-podcast_dag2. #| fig-height: 4 #| fig-cap: "An expanded version of `podcast_dag` that includes two additional variables: `skills_course`, representing a College Skills Course, and `alertness`." podcast_dag2 <- dagify( - podcast ~ mood + humor + skills_course, - alertness ~ mood, - mood ~ humor, - prepared ~ skills_course, - exam ~ alertness + prepared, - coords = time_ordered_coords(), - exposure = "podcast", - outcome = "exam", - labels = c( - podcast = "podcast", - exam = "exam score", - mood = "mood", - alertness = "alertness", - skills_course = "college\nskills course", - humor = "humor", - prepared = "prepared" - ) + podcast ~ mood + humor + skills_course, + alertness ~ mood, + mood ~ humor, + prepared ~ skills_course, + exam ~ alertness + prepared, + coords = time_ordered_coords(), + exposure = "podcast", + outcome = "exam", + labels = c( + podcast = "podcast", + exam = "exam score", + mood = "mood", + alertness = "alertness", + skills_course = "college\nskills course", + humor = "humor", + prepared = "prepared" + ) ) ggdag(podcast_dag2, use_labels = "label", text = FALSE) @@ -820,7 +825,7 @@ ggdag(podcast_dag2, use_labels = "label", text = FALSE) ```{r} #| echo: false paths <- paths(podcast_dag2) -open_paths <- glue::glue_collapse(glue::glue('`{paths$paths[paths$open]}`'), sep = ", ", last = ", and") +open_paths <- glue::glue_collapse(glue::glue("`{paths$paths[paths$open]}`"), sep = ", ", last = ", and") adj_sets <- unclass(adjustmentSets(podcast_dag2)) |> map_chr(\(.x) glue::glue('{unlist(glue::glue_collapse(.x, sep = " + "))}')) adj_sets <- glue::glue("`{adj_sets}`") ``` @@ -880,10 +885,10 @@ podcast_dag3 <- dagify( coords = time_ordered_coords( list( # time point 1 - c("prepared", "humor", "mood"), + c("prepared", "humor", "mood"), # time point 2 - "podcast", - "showed_up", + "podcast", + "showed_up", # time point 3 "exam" ) @@ -909,11 +914,11 @@ Unfortunately, we cannot calculate the total effect of `podcast` on `exam` becau ```{r} #| label: fig-podcast_dag3-as #| fig-width: 4.5 -#| fig-height: 4 +#| fig-height: 4 #| warning: false #| fig-cap: "The adjustment set for `podcast_dag3` given that the data are inherently conditioned on showing up to the exam. In this case, there is no way to recover an unbiased estimate of the total effect of `podcast` on `exam`." -podcast_dag3 |> - adjust_for("showed_up") |> +podcast_dag3 |> + adjust_for("showed_up") |> ggdag_adjustment_set(text = FALSE, use_labels = "label") ``` @@ -923,11 +928,11 @@ We can't calculate the total effect because we are missing the indirect effect, ```{r} #| label: fig-podcast_dag3-direct #| fig-width: 4.5 -#| fig-height: 4 +#| fig-height: 4 #| warning: false #| fig-cap: "The adjustment set for `podcast_dag3` when targeting a different effect. There is one minimal adjustment set that we can use to estimate the direct effect of `podcast` on `exam`." -podcast_dag3 |> - adjust_for("showed_up") |> +podcast_dag3 |> + adjust_for("showed_up") |> ggdag_adjustment_set(effect = "direct", text = FALSE, use_labels = "label") ``` @@ -941,7 +946,7 @@ It's called M-bias because it looks like an M when arranged top to bottom. #| fig-width: 4 #| fig-height: 4 #| fig-cap: "A DAG representing M-Bias, a situation where a collider predates the exposure and outcome." -m_bias() |> +m_bias() |> ggdag() ``` @@ -974,7 +979,7 @@ podcast_dag4 <- dagify( coords = time_ordered_coords(list( c("u1", "u2"), "mood", - "podcast", + "podcast", "exam" )), exposure = "podcast", @@ -1005,8 +1010,8 @@ There is no way to close this open path. #| fig-width: 5.5 #| fig-height: 4.5 #| fig-cap: "The adjustment set where `mood` is a collider. If we control for `mood` and don't know about or have the unmeasured causes of `mood`, we have no means of closing the backdoor path opened by adjusting for a collider." -podcast_dag4 |> - adjust_for("mood") |> +podcast_dag4 |> + adjust_for("mood") |> ggdag_adjustment_set(use_labels = "label", text = FALSE) ``` @@ -1023,7 +1028,7 @@ This arrangement is sometimes called butterfly or bowtie bias, again because of #| fig-width: 5 #| fig-height: 4 #| fig-cap: "In butterfly bias, `mood` is both a collider and a confounder. Controlling for the bias induced by `mood` opens a new pathway because we've also conditioned on a collider. We can't properly close all backdoor paths without either `u1` or `u2`." -butterfly_bias(x = "podcast", y = "exam", m = "mood", a = "u1", b = "u2") |> +butterfly_bias(x = "podcast", y = "exam", m = "mood", a = "u1", b = "u2") |> ggdag(text = FALSE, use_labels = "label") ``` @@ -1051,9 +1056,9 @@ podcast_dag5 <- dagify( coords = time_ordered_coords( list( # time point 1 - c("prepared", "humor", "mood"), + c("prepared", "humor", "mood"), # time point 2 - c("podcast", "grader_mood"), + c("podcast", "grader_mood"), # time point 3 "exam" ) @@ -1137,7 +1142,7 @@ error_dag <- dagify( coords = time_ordered_coords() ) -error_dag |> +error_dag |> ggdag(text = FALSE, use_labels = "label") ``` @@ -1165,46 +1170,47 @@ We'll discuss estimands in detail in [Chapter -@sec-estimands]. #| label: tbl-dag-properties #| echo: false #| message: false -#| tbl-cap: "A table of DAG properties measured by @Tennant2021. Number of nodes and arcs are the median number of variables and arrows in the analyzed DAGs, while the Node to Arc ratio is their ratio. Saturation proportion is the proportion of all possible arrows going forward in time to other included variables. Fully saturated DAGs are those that include all such arrows. @Tennant2021 also analyzed whether studies reported their estimands and adjustment sets." +#| tbl-cap: "A table of DAG properties in applied health research. Number of nodes and arcs are the median number of variables and arrows in the analyzed DAGs, while the Node to Arc ratio is their ratio. Saturation proportion is the proportion of all possible arrows going forward in time to other included variables. Fully saturated DAGs are those that include all such arrows. The researchers also analyzed whether studies reported their estimands and adjustment sets." library(gtsummary) library(gt) -dag_data_used <- dag_data |> +dag_data_used <- dag_data |> filter(used_dag) as_yes_no <- function(x) { factor(x, c(TRUE, FALSE), labels = c("Yes", "No")) } -dag_data_used |> +dag_data_used |> mutate( saturated = near(saturation, 1), report_adjset = report_adjset == "Yes", across(c(saturated, reported_estimand, report_adjset), as_yes_no) - ) |> - select(nodes, arcs, ratio, saturation, saturated, reported_estimand, report_adjset) |> - tbl_summary(label = list( - nodes ~ "Number of Nodes", - arcs ~ "Number of Arcs", - ratio ~ "Node to Arc Ratio", - saturation ~ "Saturation Proportion", - saturated ~ "Fully Saturated", - reported_estimand ~ "Reported Estimand", - report_adjset ~ "Reported Adjustment Set" - ), - digits = list(everything() ~ 0, c(ratio, saturation) ~ 2), - type = list(where(is.factor) ~ "categorical") -) |> - as_gt() |> + ) |> + select(nodes, arcs, ratio, saturation, saturated, reported_estimand, report_adjset) |> + tbl_summary( + label = list( + nodes ~ "Number of Nodes", + arcs ~ "Number of Arcs", + ratio ~ "Node to Arc Ratio", + saturation ~ "Saturation Proportion", + saturated ~ "Fully Saturated", + reported_estimand ~ "Reported Estimand", + report_adjset ~ "Reported Adjustment Set" + ), + digits = list(everything() ~ 0, c(ratio, saturation) ~ 2), + type = list(where(is.factor) ~ "categorical") + ) |> + as_gt() |> tab_row_group( label = md("**DAG properties**"), rows = 1:7, id = "dag_prop" - ) |> + ) |> tab_row_group( label = md("**Reporting**"), rows = 8:13, id = "reporting" - ) |> + ) |> row_group_order(c("dag_prop", "reporting")) ``` @@ -1268,7 +1274,7 @@ dagify( ac_use ~ global_temp, global_temp ~ ac_use, labels = c(ac_use = "A/C use", global_temp = "Global\ntemperature") -) |> +) |> ggdag(layout = "circle", edge_type = "arc", text = FALSE, use_labels = "label") ``` @@ -1293,16 +1299,16 @@ dagify( ac_use_2020 ~ ac_use_2010 + global_temp_2010, coords = time_ordered_coords(), labels = c( - ac_use_1990 = "A/C use\n(1990)", + ac_use_1990 = "A/C use\n(1990)", global_temp_1990 = "Global\ntemperature\n(1990)", - ac_use_2000 = "A/C use\n(2000)", + ac_use_2000 = "A/C use\n(2000)", global_temp_2000 = "Global\ntemperature\n(2000)", - ac_use_2010 = "A/C use\n(2010)", + ac_use_2010 = "A/C use\n(2010)", global_temp_2010 = "Global\ntemperature\n(2010)", - ac_use_2020 = "A/C use\n(2020)", + ac_use_2020 = "A/C use\n(2020)", global_temp_2020 = "Global\ntemperature\n(2020)" ) -) |> +) |> ggdag(text = FALSE, use_labels = "label") ``` @@ -1398,7 +1404,7 @@ From the DAG, it would appear that the entire design is invalid! #| label: fig-case-control #| fig-width: 4.5 #| fig-height: 3.2 -#| fig-cap: "A DAG representing a matched case-control study. In such a study, selection is determined by outcome status and any matched confounders. Selection into the study is thus a collider. Since it is inherently stratified on who is actually in the study, such data are limited in the types of causal effects they can estimate." +#| fig-cap: "A DAG representing a matched case-control study. In such a study, selection is determined by outcome status and any matched confounders. Selection into the study is thus a collider. Since it is inherently stratified on who is actually in the study, such data are limited in the types of causal effects they can estimate." dagify( outcome ~ confounder + exposure, selection ~ outcome + confounder, @@ -1406,7 +1412,7 @@ dagify( exposure = "exposure", outcome = "outcome", coords = time_ordered_coords() -) |> +) |> ggdag(edge_type = "arc", text_size = 2.2) ``` @@ -1432,7 +1438,7 @@ dagify( confounder2 ~ confounder1, exposure = "exposure", outcome = "outcome" -) |> +) |> adjustmentSets() ``` @@ -1446,7 +1452,7 @@ dagify( exposure = "exposure", outcome = "outcome", latent = "confounder1" -) |> +) |> adjustmentSets() ``` @@ -1474,7 +1480,7 @@ dagify( x ~ q, q ~ p, coords = time_ordered_coords() -) |> +) |> ggdag(edge_type = "arc") ``` @@ -1519,8 +1525,8 @@ podcast_dag_sat <- dagify( coords = time_ordered_coords( list( "humor", - c("prepared", "mood"), - "podcast", + c("prepared", "mood"), + "podcast", "exam" ) ), @@ -1538,8 +1544,8 @@ podcast_dag_sat <- dagify( curvatures <- rep(0, 8) curvatures[1] <- .25 -podcast_dag_sat |> - tidy_dagitty() |> +podcast_dag_sat |> + tidy_dagitty() |> ggplot(aes(x, y, xend = xend, yend = yend)) + geom_dag_point() + geom_dag_edges_arc(curvature = curvatures) + @@ -1568,8 +1574,8 @@ podcast_dag_pruned <- dagify( coords = time_ordered_coords( list( "humor", - c("prepared", "mood"), - "podcast", + c("prepared", "mood"), + "podcast", "exam" ) ), @@ -1615,7 +1621,7 @@ When should we include this information in the DAG? We recommend first focusing on the causal structure of the DAG as if you had perfectly measured each variable [@hernan2021]. Then, consider how mismeasurement and missingness might affect the realized data, particularly related to the exposure, outcome, and critical confounders. You may prefer to present this as an alternative DAG to consider strategies for addressing the bias arising from those sources, e.g., imputation or sensitivity analyses. -After all, the DAG in \@ fig-error_dag makes you think the question is unanswerable because we have no method to close all backdoor paths. +After all, the DAG in @fig-error_dag makes you think the question is unanswerable because we have no method to close all backdoor paths. As with all open paths, that depends on the severity of the bias and our ability to reckon with it. diff --git a/chapters/chapter-07.qmd b/chapters/chapter-07.qmd index 07c97e0..3ef8e7e 100644 --- a/chapters/chapter-07.qmd +++ b/chapters/chapter-07.qmd @@ -286,11 +286,14 @@ There is not clear evidence of a lack of positivity here as both exposure levels ```{r} #| label: fig-close #| fig-cap: "Distribution of Magic Kingdom park closing time by whether the date had extra magic hours in the morning" -ggplot(seven_dwarfs_train_2018, aes( - x = factor(park_close), - group = factor(park_extra_magic_morning), - fill = factor(park_extra_magic_morning) -)) + +ggplot( + seven_dwarfs_train_2018, + aes( + x = factor(park_close), + group = factor(park_extra_magic_morning), + fill = factor(park_extra_magic_morning) + ) +) + geom_bar(position = position_dodge2(width = 0.9, preserve = "single")) + labs( fill = "Extra Magic Morning", @@ -308,11 +311,14 @@ This of course would also restrict which days we could draw conclusions about fo #| label: fig-temp #| fig-cap: "Distribution of historic temperature high at Magic Kingdom by whether the date had extra magic hours in the morning" library(halfmoon) -ggplot(seven_dwarfs_train_2018, aes( - x = park_temperature_high, - group = factor(park_extra_magic_morning), - fill = factor(park_extra_magic_morning) -)) + +ggplot( + seven_dwarfs_train_2018, + aes( + x = park_temperature_high, + group = factor(park_extra_magic_morning), + fill = factor(park_extra_magic_morning) + ) +) + geom_mirror_histogram(bins = 20) + labs( fill = "Extra Magic Morning", @@ -326,11 +332,14 @@ Examining @fig-ticket, we do not see any positivity violations. ```{r} #| label: fig-ticket #| fig-cap: "Distribution of historic temperature high at Magic Kingdom by whether the date had extra magic hours in the morning" -ggplot(seven_dwarfs_train_2018, aes( - x = park_ticket_season, - group = factor(park_extra_magic_morning), - fill = factor(park_extra_magic_morning) -)) + +ggplot( + seven_dwarfs_train_2018, + aes( + x = park_ticket_season, + group = factor(park_extra_magic_morning), + fill = factor(park_extra_magic_morning) + ) +) + geom_bar(position = "dodge") + labs( fill = "Extra Magic Morning", diff --git a/chapters/chapter-09.qmd b/chapters/chapter-09.qmd index 9f2885b..dbbefbc 100644 --- a/chapters/chapter-09.qmd +++ b/chapters/chapter-09.qmd @@ -57,7 +57,7 @@ Now, using the `tidy_smd` function, we can examine the standardized mean differe ```{r} library(halfmoon) -smds <- +smds <- seven_dwarfs_9_with_wt |> mutate(park_close = as.numeric(park_close)) |> tidy_smd( @@ -82,12 +82,12 @@ Let's start by visualizing these standardized mean differences. To do so, we lik ggplot( data = smds, aes( - x = abs(smd), - y = variable, - group = method, + x = abs(smd), + y = variable, + group = method, color = method ) -) + +) + geom_love() ``` @@ -100,35 +100,39 @@ As mentioned above, one issue with the standardized mean differences is they onl #| label: fig-boxplot #| fig.cap: "Unweighted boxplot showing the difference in historical high temperature between days that had extra magic hours and those that did not." ggplot( - seven_dwarfs_9_with_wt, + seven_dwarfs_9_with_wt, aes( - x = factor(park_extra_magic_morning), + x = factor(park_extra_magic_morning), y = park_temperature_high, group = park_extra_magic_morning - ) -) + - geom_boxplot(outlier.color = NA) + - geom_jitter() + - labs(x = "Extra magic morning", - y = "Temperature high") + ) +) + + geom_boxplot(outlier.color = NA) + + geom_jitter() + + labs( + x = "Extra magic morning", + y = "Temperature high" + ) ``` ```{r} #| label: fig-weighted-boxplot #| fig.cap: "Weighted boxplot showing the difference in historical high temperature between days that had extra magic hours and those that did not after incorporating the propensity score weight (ATE weight)." ggplot( - seven_dwarfs_9_with_wt, + seven_dwarfs_9_with_wt, aes( - x = factor(park_extra_magic_morning), + x = factor(park_extra_magic_morning), y = park_temperature_high, group = park_extra_magic_morning, weight = w_ate - ) -) + - geom_boxplot(outlier.color = NA) + - geom_jitter() + - labs(x = "Extra magic morning", - y = "Historic temperature high") + ) +) + + geom_boxplot(outlier.color = NA) + + geom_jitter() + + labs( + x = "Extra magic morning", + y = "Historic temperature high" + ) ``` Similarly, we can also examine the empirical cumulative distribution function (eCDF) for the confounder stratified by each exposure group. The unweighted eCDF can be visualized using `geom_ecdf` @@ -137,17 +141,23 @@ Similarly, we can also examine the empirical cumulative distribution function (e #| label: fig-ecdf #| fig.cap: "Unweighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours (purple) compared to those that did not (green)." -ggplot(seven_dwarfs_9_with_wt, - aes(x = park_temperature_high, - color = factor(park_extra_magic_morning))) + - geom_ecdf() + +ggplot( + seven_dwarfs_9_with_wt, + aes( + x = park_temperature_high, + color = factor(park_extra_magic_morning) + ) +) + + geom_ecdf() + scale_color_manual( "Extra Magic Morning", values = c("#5154B8", "#5DB854"), labels = c("Yes", "No") - ) + - labs(x = "Historic temperature high", - y = "Proportion <= x") + ) + + labs( + x = "Historic temperature high", + y = "Proportion <= x" + ) ``` The `{halfmoon}` package allows for the additional `weight` argument to be passed to `geom_ecdf` to display a weighted eCDF plot. @@ -156,17 +166,23 @@ The `{halfmoon}` package allows for the additional `weight` argument to be passe #| label: fig-weighted-ecdf #| fig.cap: "Weighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours (purple) compared to those that did not (green) after incorporating the propensity score weight (ATE)." -ggplot(seven_dwarfs_9_with_wt, - aes(x = park_temperature_high, - color = factor(park_extra_magic_morning))) + - geom_ecdf(aes(weights = w_ate)) + +ggplot( + seven_dwarfs_9_with_wt, + aes( + x = park_temperature_high, + color = factor(park_extra_magic_morning) + ) +) + + geom_ecdf(aes(weights = w_ate)) + scale_color_manual( "Extra Magic Morning", values = c("#5154B8", "#5DB854"), labels = c("Yes", "No") - ) + - labs(x = "Historic temperature high", - y = "Proportion <= x") + ) + + labs( + x = "Historic temperature high", + y = "Proportion <= x" + ) ``` Examining @fig-weighted-ecdf, we can notice a few things. First, compared to @fig-ecdf there is improvement in the overlap between the two distributions. In @fig-ecdf, the green line is almost always noticeably above the purple, whereas in @fig-weighted-ecdf the two lines appear to mostly overlap until we reach slightly above 80 degrees. After 80 degrees, the lines appear to diverge in the weighted plot. This is why it can be useful to examine the full distribution rather than a single summary measure. If we had just used the standardized mean difference, for example, we would have likely said these two groups are balanced and moved on. Looking at @fig-weighted-ecdf suggests that perhaps there is a non-linear relationship between the probability of having an extra magic morning and the historic high temperature. Let's try refitting our propensity score model using a natural spline. We can use the function `splines::ns` for this. @@ -192,17 +208,23 @@ Now let's see how that impacts the weighted eCDF plot #| label: fig-weighted-ecdf-2 #| fig.cap: "Weighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours (purple) compared to those that did not (green) after incorporating the propensity score weight where historic high temperature was modeled flexibly with a spline." -ggplot(seven_dwarfs_9_with_wt, - aes(x = park_temperature_high, - color = factor(park_extra_magic_morning))) + - geom_ecdf(aes(weights = w_ate)) + +ggplot( + seven_dwarfs_9_with_wt, + aes( + x = park_temperature_high, + color = factor(park_extra_magic_morning) + ) +) + + geom_ecdf(aes(weights = w_ate)) + scale_color_manual( "Extra Magic Morning", values = c("#5154B8", "#5DB854"), labels = c("Yes", "No") - ) + - labs(x = "Historic temperature high", - y = "Proportion <= x") + ) + + labs( + x = "Historic temperature high", + y = "Proportion <= x" + ) ``` Now in @fig-weighted-ecdf-2 the lines appear to overlap across the whole space. diff --git a/chapters/chapter-10.qmd b/chapters/chapter-10.qmd index 4e9c0bc..ac3df49 100644 --- a/chapters/chapter-10.qmd +++ b/chapters/chapter-10.qmd @@ -57,7 +57,7 @@ Now, using the `tidy_smd` function, we can examine the standardized mean differe ```{r} library(halfmoon) -smds <- +smds <- seven_dwarfs_9_with_wt |> mutate(park_close = as.numeric(park_close)) |> tidy_smd( @@ -82,12 +82,12 @@ Let's start by visualizing these standardized mean differences. To do so, we lik ggplot( data = smds, aes( - x = abs(smd), - y = variable, - group = method, + x = abs(smd), + y = variable, + group = method, color = method ) -) + +) + geom_love() ``` @@ -100,35 +100,39 @@ As mentioned above, one issue with the standardized mean differences is they onl #| label: fig-boxplot #| fig.cap: "Unweighted boxplot showing the difference in historical high temperature between days that had extra magic hours and those that did not." ggplot( - seven_dwarfs_9_with_wt, + seven_dwarfs_9_with_wt, aes( - x = factor(park_extra_magic_morning), + x = factor(park_extra_magic_morning), y = park_temperature_high, group = park_extra_magic_morning - ) -) + - geom_boxplot(outlier.color = NA) + - geom_jitter() + - labs(x = "Extra magic morning", - y = "Temperature high") + ) +) + + geom_boxplot(outlier.color = NA) + + geom_jitter() + + labs( + x = "Extra magic morning", + y = "Temperature high" + ) ``` ```{r} #| label: fig-weighted-boxplot #| fig.cap: "Weighted boxplot showing the difference in historical high temperature between days that had extra magic hours and those that did not after incorporating the propensity score weight (ATE weight)." ggplot( - seven_dwarfs_9_with_wt, + seven_dwarfs_9_with_wt, aes( - x = factor(park_extra_magic_morning), + x = factor(park_extra_magic_morning), y = park_temperature_high, group = park_extra_magic_morning, weight = w_ate - ) -) + - geom_boxplot(outlier.color = NA) + - geom_jitter() + - labs(x = "Extra magic morning", - y = "Historic temperature high") + ) +) + + geom_boxplot(outlier.color = NA) + + geom_jitter() + + labs( + x = "Extra magic morning", + y = "Historic temperature high" + ) ``` Similarly, we can also examine the empirical cumulative distribution function (eCDF) for the confounder stratified by each exposure group. The unweighted eCDF can be visualized using `geom_ecdf` @@ -137,17 +141,23 @@ Similarly, we can also examine the empirical cumulative distribution function (e #| label: fig-ecdf #| fig.cap: "Unweighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours (purple) compared to those that did not (green)." -ggplot(seven_dwarfs_9_with_wt, - aes(x = park_temperature_high, - color = factor(park_extra_magic_morning))) + - geom_ecdf() + +ggplot( + seven_dwarfs_9_with_wt, + aes( + x = park_temperature_high, + color = factor(park_extra_magic_morning) + ) +) + + geom_ecdf() + scale_color_manual( "Extra Magic Morning", values = c("#5154B8", "#5DB854"), labels = c("Yes", "No") - ) + - labs(x = "Historic temperature high", - y = "Proportion <= x") + ) + + labs( + x = "Historic temperature high", + y = "Proportion <= x" + ) ``` The `{halfmoon}` package allows for the additional `weight` argument to be passed to `geom_ecdf` to display a weighted eCDF plot. @@ -156,17 +166,23 @@ The `{halfmoon}` package allows for the additional `weight` argument to be passe #| label: fig-weighted-ecdf #| fig.cap: "Weighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours (purple) compared to those that did not (green) after incorporating the propensity score weight (ATE)." -ggplot(seven_dwarfs_9_with_wt, - aes(x = park_temperature_high, - color = factor(park_extra_magic_morning))) + - geom_ecdf(aes(weights = w_ate)) + +ggplot( + seven_dwarfs_9_with_wt, + aes( + x = park_temperature_high, + color = factor(park_extra_magic_morning) + ) +) + + geom_ecdf(aes(weights = w_ate)) + scale_color_manual( "Extra Magic Morning", values = c("#5154B8", "#5DB854"), labels = c("Yes", "No") - ) + - labs(x = "Historic temperature high", - y = "Proportion <= x") + ) + + labs( + x = "Historic temperature high", + y = "Proportion <= x" + ) ``` Examining @fig-weighted-ecdf, we can notice a few things. First, compared to @fig-ecdf there is improvement in the overlap between the two distributions. In @fig-ecdf, the green line is almost always noticeably above the purple, whereas in @fig-weighted-ecdf the two lines appear to mostly overlap until we reach slightly above 80 degrees. After 80 degrees, the lines appear to diverge in the weighted plot. This is why it can be useful to examine the full distribution rather than a single summary measure. If we had just used the standardized mean difference, for example, we would have likely said these two groups are balanced and moved on. Looking at @fig-weighted-ecdf suggests that perhaps there is a non-linear relationship between the probability of having an extra magic morning and the historic high temperature. Let's try refitting our propensity score model using a natural spline. We can use the function `splines::ns` for this. @@ -192,17 +208,23 @@ Now let's see how that impacts the weighted eCDF plot #| label: fig-weighted-ecdf-2 #| fig.cap: "Weighted eCDF examining the difference in distribution for historic high temperature among days that had extra magic morning hours (purple) compared to those that did not (green) after incorporating the propensity score weight where historic high temperature was modeled flexibly with a spline." -ggplot(seven_dwarfs_9_with_wt, - aes(x = park_temperature_high, - color = factor(park_extra_magic_morning))) + - geom_ecdf(aes(weights = w_ate)) + +ggplot( + seven_dwarfs_9_with_wt, + aes( + x = park_temperature_high, + color = factor(park_extra_magic_morning) + ) +) + + geom_ecdf(aes(weights = w_ate)) + scale_color_manual( "Extra Magic Morning", values = c("#5154B8", "#5DB854"), labels = c("Yes", "No") - ) + - labs(x = "Historic temperature high", - y = "Proportion <= x") + ) + + labs( + x = "Historic temperature high", + y = "Proportion <= x" + ) ``` Now in @fig-weighted-ecdf-2 the lines appear to overlap across the whole space. diff --git a/chapters/chapter-11.qmd b/chapters/chapter-11.qmd index 113b65d..ddce50a 100644 --- a/chapters/chapter-11.qmd +++ b/chapters/chapter-11.qmd @@ -266,7 +266,8 @@ If we fit a propensity score model using the one confounder $Z$ and calculate th ## fit the propensity score model finite_sample_wts <- glm( x ~ z, - data = finite_sample, family = binomial("probit") + data = finite_sample, + family = binomial("probit") ) |> augment(newdata = finite_sample, type.predict = "response") |> mutate(wts = wt_ate(.fitted, x)) @@ -299,7 +300,8 @@ sim <- function(n) { ) finite_sample_wts <- glm( x ~ z, - data = finite_sample, family = binomial("probit") + data = finite_sample, + family = binomial("probit") ) |> augment(newdata = finite_sample, type.predict = "response") |> mutate(wts = wt_ate(.fitted, x)) @@ -317,9 +319,13 @@ sim <- function(n) { ## Examine 5 different sample sizes, simulate each 1000 times set.seed(1) -finite_sample_sims <- map_df(rep(c(50, 100, 500, 1000, 5000, 10000), - each = 1000 -), sim) +finite_sample_sims <- map_df( + rep( + c(50, 100, 500, 1000, 5000, 10000), + each = 1000 + ), + sim +) bias <- finite_sample_sims %>% group_by(n) %>% @@ -512,8 +518,8 @@ library(broom) library(touringplans) library(MatchIt) -seven_dwarfs_9 <- seven_dwarfs_train_2018 |> - filter(wait_hour == 9) +seven_dwarfs_9 <- seven_dwarfs_train_2018 |> + filter(wait_hour == 9) m <- matchit( park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, @@ -554,9 +560,11 @@ seven_dwarfs_9_with_wt <- seven_dwarfs_9_with_ps |> We can fit a *weighted* outcome model by using the `weights` argument. ```{r} -lm(wait_minutes_posted_avg ~ park_extra_magic_morning, - data = seven_dwarfs_9_with_wt, - weights = w_att) |> +lm( + wait_minutes_posted_avg ~ park_extra_magic_morning, + data = seven_dwarfs_9_with_wt, + weights = w_att +) |> tidy() ``` @@ -582,24 +590,29 @@ The second option is computationally the easiest, but tends to overestimate the ```{r} fit_ipw <- function(split, ...) { .df <- analysis(split) - + # fit propensity score model propensity_model <- glm( park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, data = seven_dwarfs_9, family = binomial() - ) - + ) + # calculate inverse probability weights .df <- propensity_model |> augment(type.predict = "response", data = .df) |> - mutate(wts = wt_att(.fitted, - park_extra_magic_morning, - exposure_type = "binary")) - + mutate(wts = wt_att( + .fitted, + park_extra_magic_morning, + exposure_type = "binary" + )) + # fit correctly bootstrapped ipw model - lm(wait_minutes_posted_avg ~ park_extra_magic_morning, - data = .df, weights = wts) |> + lm( + wait_minutes_posted_avg ~ park_extra_magic_morning, + data = .df, + weights = wts + ) |> tidy() } ``` @@ -614,13 +627,13 @@ library(rsample) # fit ipw model to bootstrapped samples bootstrapped_seven_dwarfs <- bootstraps( - seven_dwarfs_9, - times = 1000, + seven_dwarfs_9, + times = 1000, apparent = TRUE ) -ipw_results <- bootstrapped_seven_dwarfs |> - mutate(boot_fits = map(splits, fit_ipw)) +ipw_results <- bootstrapped_seven_dwarfs |> + mutate(boot_fits = map(splits, fit_ipw)) ipw_results ``` @@ -639,7 +652,7 @@ ipw_results |> ) ) |> ggplot(aes(estimate)) + - geom_histogram(bins = 30, fill = "#D55E00FF", color = "white", alpha = 0.8) + + geom_histogram(bins = 30, fill = "#D55E00FF", color = "white", alpha = 0.8) + theme_minimal() ``` @@ -648,7 +661,7 @@ ipw_results |> ```{r} # get t-based CIs -boot_estimate <- int_t(ipw_results, boot_fits) |> +boot_estimate <- int_t(ipw_results, boot_fits) |> filter(term == "park_extra_magic_morning") boot_estimate ``` @@ -663,9 +676,11 @@ There are two ways to get the sandwich estimator. The first is to use the same w #| message: false #| warning: false library(sandwich) -weighted_mod <- lm(wait_minutes_posted_avg ~ park_extra_magic_morning, - data = seven_dwarfs_9_with_wt, - weights = w_att) +weighted_mod <- lm( + wait_minutes_posted_avg ~ park_extra_magic_morning, + data = seven_dwarfs_9_with_wt, + weights = w_att +) sandwich(weighted_mod) ``` @@ -711,13 +726,16 @@ The correct sandwich estimator will also take into account the propensity score library(PSW) seven_dwarfs_9 <- seven_dwarfs_9 |> - mutate(park_ticket_season_regular = ifelse(park_ticket_season == "regular", 1, 0), - park_ticket_season_value = ifelse(park_ticket_season == "value", 1, 0) + mutate( + park_ticket_season_regular = ifelse(park_ticket_season == "regular", 1, 0), + park_ticket_season_value = ifelse(park_ticket_season == "value", 1, 0) ) -psw(data = seven_dwarfs_9, - form.ps = "park_extra_magic_morning ~ park_ticket_season_regular + park_ticket_season_value + park_close + park_temperature_high", - weight = "ATT", - wt = TRUE, - out.var = "wait_minutes_posted_avg") +psw( + data = seven_dwarfs_9, + form.ps = "park_extra_magic_morning ~ park_ticket_season_regular + park_ticket_season_value + park_close + park_temperature_high", + weight = "ATT", + wt = TRUE, + out.var = "wait_minutes_posted_avg" +) ``` diff --git a/chapters/chapter-12.qmd b/chapters/chapter-12.qmd index 0bd2628..1b2dc8e 100644 --- a/chapters/chapter-12.qmd +++ b/chapters/chapter-12.qmd @@ -13,8 +13,8 @@ library(broom) library(touringplans) library(MatchIt) -seven_dwarfs_9 <- seven_dwarfs_train_2018 |> - filter(wait_hour == 9) +seven_dwarfs_9 <- seven_dwarfs_train_2018 |> + filter(wait_hour == 9) m <- matchit( park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, @@ -55,9 +55,11 @@ seven_dwarfs_9_with_wt <- seven_dwarfs_9_with_ps |> We can fit a *weighted* outcome model by using the `weights` argument. ```{r} -lm(wait_minutes_posted_avg ~ park_extra_magic_morning, - data = seven_dwarfs_9_with_wt, - weights = w_att) |> +lm( + wait_minutes_posted_avg ~ park_extra_magic_morning, + data = seven_dwarfs_9_with_wt, + weights = w_att +) |> tidy() ``` @@ -83,24 +85,29 @@ The second option is computationally the easiest, but tends to overestimate the ```{r} fit_ipw <- function(split, ...) { .df <- analysis(split) - + # fit propensity score model propensity_model <- glm( park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high, data = seven_dwarfs_9, family = binomial() - ) - + ) + # calculate inverse probability weights .df <- propensity_model |> augment(type.predict = "response", data = .df) |> - mutate(wts = wt_att(.fitted, - park_extra_magic_morning, - exposure_type = "binary")) - + mutate(wts = wt_att( + .fitted, + park_extra_magic_morning, + exposure_type = "binary" + )) + # fit correctly bootstrapped ipw model - lm(wait_minutes_posted_avg ~ park_extra_magic_morning, - data = .df, weights = wts) |> + lm( + wait_minutes_posted_avg ~ park_extra_magic_morning, + data = .df, + weights = wts + ) |> tidy() } ``` @@ -115,13 +122,13 @@ library(rsample) # fit ipw model to bootstrapped samples bootstrapped_seven_dwarfs <- bootstraps( - seven_dwarfs_9, - times = 1000, + seven_dwarfs_9, + times = 1000, apparent = TRUE ) -ipw_results <- bootstrapped_seven_dwarfs |> - mutate(boot_fits = map(splits, fit_ipw)) +ipw_results <- bootstrapped_seven_dwarfs |> + mutate(boot_fits = map(splits, fit_ipw)) ipw_results ``` @@ -140,7 +147,7 @@ ipw_results |> ) ) |> ggplot(aes(estimate)) + - geom_histogram(bins = 30, fill = "#D55E00FF", color = "white", alpha = 0.8) + + geom_histogram(bins = 30, fill = "#D55E00FF", color = "white", alpha = 0.8) + theme_minimal() ``` @@ -149,7 +156,7 @@ ipw_results |> ```{r} # get t-based CIs -boot_estimate <- int_t(ipw_results, boot_fits) |> +boot_estimate <- int_t(ipw_results, boot_fits) |> filter(term == "park_extra_magic_morning") boot_estimate ``` @@ -164,9 +171,11 @@ There are two ways to get the sandwich estimator. The first is to use the same w #| message: false #| warning: false library(sandwich) -weighted_mod <- lm(wait_minutes_posted_avg ~ park_extra_magic_morning, - data = seven_dwarfs_9_with_wt, - weights = w_att) +weighted_mod <- lm( + wait_minutes_posted_avg ~ park_extra_magic_morning, + data = seven_dwarfs_9_with_wt, + weights = w_att +) sandwich(weighted_mod) ``` @@ -212,13 +221,16 @@ The correct sandwich estimator will also take into account the propensity score library(PSW) seven_dwarfs_9 <- seven_dwarfs_9 |> - mutate(park_ticket_season_regular = ifelse(park_ticket_season == "regular", 1, 0), - park_ticket_season_value = ifelse(park_ticket_season == "value", 1, 0) + mutate( + park_ticket_season_regular = ifelse(park_ticket_season == "regular", 1, 0), + park_ticket_season_value = ifelse(park_ticket_season == "value", 1, 0) ) -psw(data = seven_dwarfs_9, - form.ps = "park_extra_magic_morning ~ park_ticket_season_regular + park_ticket_season_value + park_close + park_temperature_high", - weight = "ATT", - wt = TRUE, - out.var = "wait_minutes_posted_avg") +psw( + data = seven_dwarfs_9, + form.ps = "park_extra_magic_morning ~ park_ticket_season_regular + park_ticket_season_value + park_close + park_temperature_high", + weight = "ATT", + wt = TRUE, + out.var = "wait_minutes_posted_avg" +) ``` diff --git a/chapters/chapter-13.qmd b/chapters/chapter-13.qmd index 9c413aa..6d7e7d1 100644 --- a/chapters/chapter-13.qmd +++ b/chapters/chapter-13.qmd @@ -271,9 +271,13 @@ ggplot(wait_times_wts, aes(wait_minutes_posted_avg, swts)) + wait_times_wts |> filter(swts > 10) |> select( - park_date, wait_minutes_posted_avg, .fitted, - park_close, park_extra_magic_morning, - park_temperature_high, park_ticket_season + park_date, + wait_minutes_posted_avg, + .fitted, + park_close, + park_extra_magic_morning, + park_temperature_high, + park_ticket_season ) |> knitr::kable() ```