From 6ef17dedab174da43bc660fc13978d7b4c4cd62b Mon Sep 17 00:00:00 2001 From: Malcolm Barrett Date: Thu, 9 Nov 2023 11:15:00 -0500 Subject: [PATCH] fill in citations --- .../chapter-05/execute-results/html.json | 4 +- .../chapter-16/execute-results/html.json | 4 +- chapters/chapter-05.qmd | 45 ++--- chapters/chapter-16.qmd | 5 +- citations.bib | 182 +++++++++++++++++- 5 files changed, 212 insertions(+), 28 deletions(-) diff --git a/_freeze/chapters/chapter-05/execute-results/html.json b/_freeze/chapters/chapter-05/execute-results/html.json index 453fbb0..9c21544 100644 --- a/_freeze/chapters/chapter-05/execute-results/html.json +++ b/_freeze/chapters/chapter-05/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "38333e38dc33c590c0e70d047d0a8c63", + "hash": "7a1a06427dcc094e0606c80041bcc449", "result": { - "markdown": "# Expressing causal questions as DAGs {#sec-dags}\n\n\n\n\n\n## Visualizing Causal Assumptions\n\n> Draw your assumptions before your conclusions --@hernan2021\n\nCausal diagrams are a tool to visualize your assumptions about the causal structure of the questions you're trying to answer.\nIn a randomized experiment, the causal structure is quite simple.\nWhile there may be many causes of an outcome, the only cause of the exposure is the randomization process itself (we hope!).\nIn many non-randomized settings, however, the structure of your question can be a complex web of causality.\nCausal diagrams help communicate what we think this structure looks like.\nIn addition to being open about what we think the causal structure is, causal diagrams have incredible mathematical properties that allow us to identify a way to estimate unbiased causal effects even with observational data.\n\nCausal diagrams are also increasingly common.\nData collected as a review of causal diagrams in applied health research papers show a drastic increase in use over time [@Tennant2021].\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Percentage of health research papers 
using causal diagrams over time.](chapter-05_files/figure-html/fig-dag-usage-1.png){#fig-dag-usage width=672}\n:::\n:::\n\n\nThe type of causal diagrams we use are also called directed acyclic graphs (DAGs)[^1].\nThese graphs are directed because they include arrows going in a specific direction.\nThey're acyclic because they don't go in circles; a variable can't cause itself, for instance.\nDAGs are used for various problems, but we're specifically concerned with *causal* DAGs.\nThis class of DAGs is sometimes called Structural Causal Models (SCMs) because they are a model of the causal structure of a question.\n\n[^1]: An essential but rarely observed detail of DAGs is that dag is also an [affectionate Australian insult](https://en.wikipedia.org/wiki/Dag_(slang)) referring to the dung-caked fur of a sheep, a *daglock*.\n\nDAGs depict causal relationships between variables.\nVisually, the way they depict variables is as *edges* and *nodes*.\nEdges are the arrows going from one variable to another, sometimes called arcs or just arrows.\nNodes are the variables themselves, sometimes called vertices, points, or just variables.\nIn @fig-dag-basic, there are two nodes, `x` and `y`, and one edge going from `x` to `y`.\nHere, we are saying that `x` causes `y`.\n`y` \"listens\" to `x` [@Pearl_Glymour_Jewell_2021].\n\n\n::: {.cell}\n::: {.cell-output-display}\n![A causal directed acyclic graph (DAG). DAGs depict causal relationships. 
In this DAG, the assumption is that `x` causes `y`.](chapter-05_files/figure-html/fig-dag-basic-1.png){#fig-dag-basic width=288}\n:::\n:::\n\n\nIf we're interested in the causal effect of `x` on `y`, we're trying to estimate a numeric representation of that arrow.\nUsually, though, there are many other variables and arrows in the causal structure of a given question.\nA series of arrows is called a *path*.\nThere are three types of paths you'll see in DAGs: forks, chains, and colliders (sometimes called inverse forks).\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Three types of causal relationships: forks, chains, and colliders. The direction of the arrows and the relationships of interest dictate which type of path a series of variables is. Forks represent a mutual cause, chains represent direct causes, and colliders represent a mutual descendant.](chapter-05_files/figure-html/fig-dag-path-types-1.png){#fig-dag-path-types width=672}\n:::\n:::\n\n\nForks represent a common cause of two variables.\nHere, we're saying that `q` causes both `x` and `y`, the traditional definition of a confounder.\nThey're called forks because the arrows from `x` to `y` are in different directions.\nChains, on the other hand, represent a series of arrows going in the same direction.\nHere, `q` is called a *mediator*: it is along the causal path from `x` to `y`.\nIn this diagram, the only path from `x` to `y` is mediated through `q`.\nFinally, a collider is a path where two arrowheads meet at a variable.\nBecause causality always goes forward in time, this naturally means that the collider variable is caused by two other variables.\nHere, we're saying that `x` and `y` both cause `q`.\n\n::: callout-tip\n## Are DAGs SEMs?\n\nIf you're familiar with structural equation models (SEMs), a modeling technique commonly used in psychology and other social science settings, you may notice some similarities between SEMs and DAGs.\nDAGs are a form of *non-parametric* SEM.\nSEMs estimate entire 
graphs using parametric assumptions.\nCausal DAGs, on the other hand, don't estimate anything; an arrow going from one variable to another says nothing about the strength or functional form of that relationship, only that we think it exists.\n:::\n\nOne of the significant benefits of DAGs is that they help us identify sources of bias and, often, provide clues on how to address them.\nHowever, talking about an unbiased effect estimate only makes sense when we have a specific causal question in mind.\nSince each arrow represents a cause, it's causality all the way down; no individual arrow is inherently problematic.\nHere, we're interested in the effect of `x` on `y`.\nThis question defines which paths we're interested in and which we're not.\n\nThese three types of paths have different implications for the statistical relationship between `x` and `y`.\nIf we only look at the correlation between the two variables under these assumptions:\n\n1. In the fork, `x` and `y` will be associated, despite there being no arrow from `x` to `y`.\n2. In the chain, `x` and `y` are related only through `q`.\n3. 
In the collider, `x` and `y` will *not* be related.\n\nPaths that transmit association are called *open paths*.\nPaths that do not transmit association are called *closed paths*.\nForks and chains are open, while colliders are closed.\n\nSo, should we adjust for `q`?\nThat depends on the nature of the path.\nForks are confounding paths.\nBecause `q` causes both `x` and `y`, `x` and `y` will have a spurious association.\nThey both contain information from `q`, their mutual cause.\nThat mutual causal relationship makes `x` and `y` associated statistically.\nAdjusting for `q` will *block* the bias from confounding and give us the true relationship between `x` and `y`.\n\n::: callout-tip\n## Adjustment\n\nWe can use a variety of techniques to account for a variable.\nWe use the term \"adjustment\" or \"controlling for\" to refer to any technique that removes the effect of variables we're not interested in.\n:::\n\n@fig-confounder-scatter depicts this effect visually.\nHere, `x` and `y` are continuous, and by definition of the DAG, they are unrelated.\n`q`, however, causes both.\nThe unadjusted effect is biased because it includes information about the open path from `x` to `y` via `q`.\nWithin levels of `q`, however, `x` and `y` are unrelated.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Two scatterplots of the relationship between `x` and `y`. With forks, the relationship is biased by `q`. 
When accounting for `q`, we see the true null relationship.](chapter-05_files/figure-html/fig-confounder-scatter-1.png){#fig-confounder-scatter width=672}\n:::\n:::\n\n\nFor chains, whether or not we adjust for mediators depends on the research question.\nHere, adjusting for `q` would result in a null estimate of the effect of `x` on `y`.\nBecause the only effect of `x` on `y` is via `q`, no other effect remains.\nThe effect of `x` on `y` mediated by `q` is called the *indirect* effect, while the effect of `x` on `y` directly is called the *direct* effect.\nIf we're only interested in the direct effect, controlling for `q` might be what we want.\nIf we want to know about both effects, we shouldn't try to adjust for `q`.\nWe'll learn more about estimating these and other mediation effects in @sec-mediation.\n\n@fig-mediator-scatter shows this effect visually.\nThe unadjusted effect of `x` on `y` represents the total effect.\nSince the total effect is due entirely to the path mediated by `q`, when we adjust for `q`, no relationship remains.\nThis null effect is the direct effect.\nNeither of these effects is due to bias, but each answers a different research question.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Two scatterplots of the relationship between `x` and `y`. With chains, whether and how we should account for `q` depends on the research question. Without doing so, we see the impact of the total effect of `x` and `y`, including the indirect effect via `q`. 
When accounting for `q`, we see the direct (null) effect of `x` on `y`.](chapter-05_files/figure-html/fig-mediator-scatter-1.png){#fig-mediator-scatter width=672}\n:::\n:::\n\n\nColliders are different.\nIn the collider DAG of @fig-dag-path-types, `x` and `y` are *not* associated, but both cause `q`.\nAdjusting for `q` has the opposite effect than with confounding: it *opens* a biasing pathway.\nSometimes, people draw the path opened up by conditioning on a collider connecting `x` and `y`.\n\nVisually, we can see this happen when `x` and `y` are continuous and `q` is binary.\nIn @fig-collider-scatter, when we don't include `q`, we find no relationship between `x` and `y`.\nThat's the correct result.\nHowever, when we include `q`, we can detect information about both `x` and `y`, and they appear correlated: across levels of `x`, those with `q = 0` have lower levels of `y`.\nAssociation seemingly flows back in time.\nOf course, that can't happen from a causal perspective, so controlling for `q` is the wrong thing to do.\nWe end up with a biased effect of `x` on `y`.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Two scatterplots of the relationship between `x` and `y`. The unadjusted relationship between the two is unbiased. When accounting for `q`, we open a colliding backdoor path and bias the relationship between `x` and `y`.](chapter-05_files/figure-html/fig-collider-scatter-1.png){#fig-collider-scatter width=672}\n:::\n:::\n\n\nHow can this be?\nSince `x` and `y` happen before `q`, `q` can't impact them.\nLet's turn the DAG on its side and consider @fig-collider-time.\nIf we break down the two time points, at time point 1, `q` hasn't happened yet, and `x` and `y` are unrelated.\nAt time point 2, `q` happens due to `x` and `y`.\nBut causality only goes forward in time.\n`q` happening later can't change the fact that `x` and `y` happened independently at time point 1.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![A collider relationship over two points in time. 
At time point one, there is no relationship between `x` and `y`. Both cause `q` by time point two, but this does not change what already happened at time point one.](chapter-05_files/figure-html/fig-collider-time-1.png){#fig-collider-time width=672}\n:::\n:::\n\n\nCausality only goes forward.\nAssociation, however, is time-agnostic.\nIt's just an observation about the numerical relationships between variables.\nWhen we control for the future, we risk introducing bias.\nIt takes time to develop an intuition for this.\nConsider a case where `x` and `y` are the only causes of `q`, and all three variables are binary.\nWhen *either* `x` or `y` equals 1, then `q` happens.\nIf we know `q = 1` and `x = 0` then logically it must be that `y = 1`.\nThus, knowing about `q` gives us information about `y` via `x`.\nThis example is extreme, but it shows how this type of bias, sometimes called *collider-stratification bias* or *selection bias*, occurs: conditioning on `q` provides statistical information about `x` and `y` and distorts their relationship.\n\n::: callout-tip\n## Exchangeability revisited\n\nWe commonly refer to exchangeability as the assumption of no confounding.\nActually, this isn't quite right.\nIt's the assumption of no *open, non-causal* paths.\nMany times, these are confounding pathways.\nHowever, conditioning on a collider can also open paths.\nEven though these aren't confounders, doing so creates non-exchangeability between the two groups: they are different in a way that matters to the exposure and outcome.\n\nOpen, non-causal paths are also called *backdoor paths*.\nWe'll use this terminology often because it captures the idea well: these are any open paths biasing the effect we're interested in estimating.\n:::\n\nCorrectly identifying the causal structure between the exposure and outcome thus helps us 1) communicate the assumptions we're making about the relationships between variables and 2) identify sources of bias.\nImportantly, in doing 2), we are 
also often able to identify ways to prevent bias based on the assumptions in 1).\nIn the simple case of the three DAGs in @fig-dag-path-types, we know whether or not to control for `q` depending on the nature of the causal structure.\nThe set or sets of variables we need to adjust for is called the *adjustment set*.\nDAGs can help us identify adjustment sets even in complex settings.\n\n::: callout-tip\n## What about interaction?\n\nDAGs don't make a statement about interaction or effect estimate modification, even though they are an important part of inference.\nTechnically, interaction is a matter of the functional form of the relationships in the DAG.\nMuch as we don't need to specify how we will model a variable in the DAG (e.g., with splines), we don't need to determine how variables statistically interact.\nThat's a matter for the modeling stage.\n\nThere are several ways we use interactions in causal inference.\nIn one extreme, they are simply a matter of functional form: interaction terms are included in models but marginalized to get an overall causal effect.\nConversely, we're interested in *joint causal effects*, where the two variables interacting are both causal.\nIn between, we can use interaction terms to identify *heterogeneous causal effects*, which vary by a second variable that is not assumed to be causal.\nAs with many tools in causal inference, we use the same statistical technique in many ways to answer different questions. 
\nWe'll revisit this topic in detail in [Chapter -@sec-interaction].\n\nMany people have tried expressing interaction in DAGs using different types of arcs, nodes, and other annotations, but no approach has taken off as the preferred way.\n:::\n\nLet's take a look at an example in R.\nWe'll learn to build DAGs, visualize them, and identify important information like adjustment sets.\n\n## DAGs in R\n\nFirst, consider a research question: Does listening to a comedy podcast the morning before an exam improve graduate students' test scores?\nWe can diagram this using the method described in @sec-diag (@fig-diagram-podcast).\n\n\n::: {.cell}\n::: {.cell-output-display}\n![A sentence diagram for the question: Does listening to a comedy podcast the morning before an exam improve graduate student test scores? The population is graduate students. The start time is morning, and the outcome time is after the exam.](../images/podcast-diagram.png){#fig-diagram-podcast width=2267}\n:::\n:::\n\n\nThe tool we'll use for making DAGs is ggdag.\nggdag is a package that connects ggplot2, the most powerful visualization tool in R, to dagitty, an R package with sophisticated algorithms for querying DAGs.\n\nTo create a DAG object, we'll use the `dagify()` function. `dagify()` returns a `dagitty` object that works with both the dagitty and ggdag packages.\nThe `dagify()` function takes formulas, separated by commas, that specify causes and effects, with the left element of the formula defining the effect and the right all of the factors that cause it.\nThis is just like the type of formula we specify for most regression models in R.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n effect1 ~ cause1 + cause2 + cause3,\n effect2 ~ cause1 + cause4,\n ...\n)\n```\n:::\n\n\nWhat are all of the factors that cause graduate students to listen to a podcast the morning before an exam?\nWhat are all of the factors that could cause a graduate student to do well on a test?\nLet's posit some 
here.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(ggdag)\ndagify(\n podcast ~ mood + humor + prepared,\n exam ~ mood + prepared\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\ndag {\nexam\nhumor\nmood\npodcast\nprepared\nhumor -> podcast\nmood -> exam\nmood -> podcast\nprepared -> exam\nprepared -> podcast\n}\n```\n\n\n:::\n:::\n\n\nIn the code above, we assume that:\n\n- a graduate student's mood, sense of humor, and how prepared they feel for the exam could influence whether they listened to a podcast the morning of the test\n- their mood and how prepared they are also influence their exam score\n\nNotice we *do not* see podcast in the exam equation; this means that we assume that there is **no** causal relationship between podcast and the exam score.\n\nThere are some other useful arguments you'll often find yourself supplying to `dagify()`:\n\n- `exposure` and `outcome`: Telling ggdag the variables that are the exposure and outcome of your research question is required for many of the most valuable queries we can make of DAGs.\n- `latent`: This argument lets us tell ggdag that some variables in the DAG are unmeasured. `latent` helps identify valid adjustment sets with the data we actually have.\n- `coords`: Coordinates for the variables. You can choose between algorithmic or manual layouts, as discussed below. 
We'll use `time_ordered_coords()` here.\n- `labels`: A character vector of labels for the variables.\n\nLet's create a DAG object, `podcast_dag`, with some of these attributes, then visualize the DAG with `ggdag()`.\n`ggdag()` returns a ggplot object, so we can add additional layers to the plot, like themes.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag <- dagify(\n podcast ~ mood + humor + prepared,\n exam ~ mood + prepared,\n coords = time_ordered_coords(\n list(\n # time point 1\n c(\"prepared\", \"humor\", \"mood\"), \n # time point 2\n \"podcast\", \n # time point 3\n \"exam\"\n )\n ),\n exposure = \"podcast\",\n outcome = \"exam\",\n labels = c(\n podcast = \"podcast\",\n exam = \"exam score\",\n mood = \"mood\",\n humor = \"humor\",\n prepared = \"prepared\"\n )\n)\nggdag(podcast_dag, use_labels = \"label\", text = FALSE) +\n theme_dag()\n```\n\n::: {.cell-output-display}\n![Proposed DAG to answer the question: Does listening to a comedy podcast the morning before an exam improve graduate students' test scores?](chapter-05_files/figure-html/fig-dag-podcast-1.png){#fig-dag-podcast width=384}\n:::\n:::\n\n\n::: callout-note\nFor the rest of the chapter, we'll use `theme_dag()`, a ggplot theme from ggdag meant for DAGs.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntheme_set(\n theme_dag() %+replace%\n # also add some additional styling\n theme(\n legend.position = \"bottom\",\n strip.text.x = element_text(margin = margin(2, 0, 2, 0, \"mm\"))\n )\n)\n```\n:::\n\n:::\n\n::: callout-tip\n## DAG coordinates\n\nYou don't need to specify coordinates to ggdag.\nIf you don't, it uses algorithms designed for automatic layouts.\nThere are many such algorithms, and they focus on different aspects of the layout, e.g., the shape, the space between the nodes, minimizing how many edges cross, etc.\nThese layout algorithms usually have a component of randomness, so it's good to use a seed if you want to get the same result.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r 
.cell-code}\n# no coordinates specified\nset.seed(123)\npod_dag <- dagify(\n podcast ~ mood + humor + prepared,\n exam ~ mood + prepared\n)\n\n# automatically determine layouts\npod_dag |> \n ggdag(text_size = 2.8)\n```\n\n::: {.cell-output-display}\n![](chapter-05_files/figure-html/unnamed-chunk-14-1.png){fig-align='center' width=384}\n:::\n:::\n\n\nWe can also ask for a specific layout, e.g., the popular Sugiyama algorithm for DAGs.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npod_dag |> \n ggdag(layout = \"sugiyama\", text_size = 2.8)\n```\n\n::: {.cell-output-display}\n![](chapter-05_files/figure-html/unnamed-chunk-15-1.png){fig-align='center' width=384}\n:::\n:::\n\n\nFor causal DAGs, the time-ordered layout algorithm is often best, which we can specify with `time_ordered_coords()` or `layout = \"time_ordered\"`.\nWe'll discuss time ordering in greater detail in @sec-time-ordered.\nEarlier, we explicitly told ggdag which variables were at which time points, but we don't need to.\nNotice, though, that the time ordering algorithm puts `podcast` and `exam` at the same time point since one doesn't cause another (and thus predate it).\nWe know that's not the case: listening to the podcast happened before taking the exam.\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npod_dag |> \n ggdag(layout = \"time_ordered\", text_size = 2.8)\n```\n\n::: {.cell-output-display}\n![](chapter-05_files/figure-html/unnamed-chunk-16-1.png){fig-align='center' width=384}\n:::\n:::\n\n\nYou can manually specify coordinates using a list or data frame and provide them to the `coords` argument of `dagify()`.\nAdditionally, because ggdag is based on dagitty, you can use `dagitty.net` to create and organize a DAG using a graphical interface, then export the result as dagitty code for ggdag to consume.\n\nAlgorithmic layouts are lovely for fast visualization of DAGs or particularly complex graphs.\nOnce you want to share your DAG, it's usually best to 
be more intentional about the layout, perhaps by specifying the coordinates manually.\n`time_ordered_coords()` is often the best of both worlds, and we'll use it for most DAGs in this book.\n:::\n\nWe've specified the DAG for this question and told ggdag what the exposure and outcome of interest are.\nAccording to the DAG, there is no direct causal relationship between listening to a podcast and exam scores.\nAre there any other open paths?\n`ggdag_paths()` takes a DAG and visualizes the open paths.\nIn @fig-paths-podcast, we see two open paths: `podcast <- mood -> exam` and `podcast <- prepared -> exam`. These are both forks---*confounding pathways*. Since there is no causal relationship between listening to a podcast and exam scores, the only open paths are *backdoor* paths, these two confounding pathways.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag |> \n # show the whole dag as a light gray \"shadow\" \n # rather than just the paths\n ggdag_paths(shadow = TRUE, text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![`ggdag_paths()` visualizes open paths in a DAG. 
There are two open paths in `podcast_dag`: the fork from `mood` and the fork from `prepared`.](chapter-05_files/figure-html/fig-paths-podcast-1.png){#fig-paths-podcast width=672}\n:::\n:::\n\n\n::: callout-tip\n`dagify()` returns a `dagitty()` object, but underneath the hood, ggdag converts `dagitty` objects to tidy DAGs, a structure that holds both the `dagitty` object and a `dataframe` about the DAG.\nThis is handy if you want to manipulate the DAG programmatically.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag_tidy <- podcast_dag |> \n tidy_dagitty()\n\npodcast_dag_tidy\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A DAG with 5 nodes and 5 edges\n#\n# Exposure: podcast\n# Outcome: exam\n#\n# A tibble: 7 × 9\n name x y direction to xend yend\n \n1 exam 3 0 NA NA\n2 humor 1 0 -> podcast 2 0\n3 mood 1 1 -> exam 3 0\n4 mood 1 1 -> podcast 2 0\n5 podcast 2 0 NA NA\n6 prepared 1 -1 -> exam 3 0\n7 prepared 1 -1 -> podcast 2 0\n# ℹ 2 more variables: circular , label \n```\n\n\n:::\n:::\n\n\nMost of the quick plotting functions transform the `dagitty` object to a tidy DAG if it's not already, then manipulate the data in some capacity.\nFor instance, `dag_paths()` underlies `ggdag_paths()`; it returns a tidy DAG with data about the paths.\nYou can use several dplyr functions on these objects directly.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag_tidy |> \n dag_paths() |> \n filter(set == 2, path == \"open path\")\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A DAG with 3 nodes and 2 edges\n#\n# Exposure: podcast\n# Outcome: exam\n#\n# A tibble: 4 × 11\n set name x y direction to xend yend\n \n1 2 exam 3 0 NA NA\n2 2 podcast 2 0 NA NA\n3 2 prepar… 1 -1 -> exam 3 0\n4 2 prepar… 1 -1 -> podc… 2 0\n# ℹ 3 more variables: circular , label ,\n# path \n```\n\n\n:::\n:::\n\n\nTidy DAGs are not pure data frames, but you can retrieve either the `dataframe` or `dagitty` object to work with them directly using `pull_dag_data()` or 
`pull_dag()`.\n`pull_dag()` can be useful when you want to work with dagitty functions:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dagitty)\npodcast_dag_tidy |> \n pull_dag() |> \n paths()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n$paths\n[1] \"podcast <- mood -> exam\" \n[2] \"podcast <- prepared -> exam\"\n\n$open\n[1] TRUE TRUE\n```\n\n\n:::\n:::\n\n:::\n\nBackdoor paths pollute the statistical association between `podcast` and `exam`, so we must account for them.\n`ggdag_adjustment_set()` visualizes any valid adjustment sets implied by the DAG.\n@fig-podcast-adustment-set shows adjusted variables as squares.\nAny arrows coming out of adjusted variables are removed from the DAG because the path is no longer open at that variable.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nggdag_adjustment_set(\n podcast_dag, \n text = FALSE, \n use_labels = \"label\"\n)\n```\n\n::: {.cell-output-display}\n![A visualization of the minimal adjustment set for the podcast-exam DAG. 
If this DAG is correct, two variables are required to block the backdoor paths: `mood` and `prepared`.](chapter-05_files/figure-html/fig-podcast-adustment-set-1.png){#fig-podcast-adustment-set fig-align='center' width=384}\n:::\n:::\n\n\n@fig-podcast-adustment-set shows the *minimal adjustment set*.\nBy default, ggdag returns the set(s) that can close all backdoor paths with the fewest number of variables possible.\nIn this DAG, that's just one set: `mood` and `prepared`.\nThis set makes sense because there are two backdoor paths, and the only other variables on them besides the exposure and outcome are these two variables.\nSo, at minimum, we must account for both to get a valid estimate.\n\n::: callout-tip\n`ggdag()` and friends usually use `tidy_dagitty()` and `dag_*()` or `node_*()` functions to change the underlying data frame.\nSimilarly, the quick plotting functions use ggdag's geoms to visualize the resulting DAG(s).\nIn other words, you can use the same data manipulation and visualization strategies that you use day-to-day directly with ggdag.\n\nHere's a condensed version of what `ggdag_adjustment_set()` is doing:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npodcast_dag_tidy |> \n # add adjustment sets to data\n dag_adjustment_sets() |>\n ggplot(aes(\n x = x, y = y, xend = xend, yend = yend,\n color = adjusted, shape = adjusted\n )) + \n # ggdag's custom geoms: add nodes, edges, and labels\n geom_dag_point() + \n # remove adjusted paths\n geom_dag_edges_link(data = \\(.df) filter(.df, adjusted != \"adjusted\")) + \n geom_dag_label_repel() + \n # you can use any ggplot function, too\n facet_wrap(~ set) +\n scale_shape_manual(values = c(adjusted = 15, unadjusted = 19))\n```\n\n::: {.cell-output-display}\n![](chapter-05_files/figure-html/unnamed-chunk-22-1.png){fig-align='center' width=432}\n:::\n:::\n\n:::\n\nMinimal adjustment sets are only one type of valid adjustment set.\nSometimes, other combinations of variables can get us an 
unbiased effect estimate.\nTwo other options available in ggdag are full adjustment sets and canonical adjustment sets.\nFull adjustment sets are every combination of variables that result in a valid set.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nggdag_adjustment_set(\n podcast_dag, \n text = FALSE, \n use_labels = \"label\",\n # get full adjustment sets\n type = \"all\"\n)\n```\n\n::: {.cell-output-display}\n![All valid adjustment sets for `podcast_dag`.](chapter-05_files/figure-html/fig-adustment-set-all-1.png){#fig-adustment-set-all fig-align='center' width=624}\n:::\n:::\n\n\nIt turns out that we can also control for `humor`.\n\nCanonical adjustment sets are a bit more complex: they are all possible ancestors of the exposure and outcome minus any likely descendants.\nIn fully saturated DAGs (DAGs where every node causes anything that comes after it in time), the canonical adjustment set is the minimal adjustment set.\n\n::: callout-tip\nMost of the functions in ggdag use dagitty underneath the hood.\nIt's often helpful to call dagitty functions directly.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nadjustmentSets(podcast_dag, type = \"canonical\")\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n{ humor, mood, prepared }\n```\n\n\n:::\n:::\n\n:::\n\nUsing our proposed DAG, let's simulate some data to see how accounting for the minimal adjustment set might occur in practice.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nset.seed(10)\nsim_data <- podcast_dag |>\n simulate_data()\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nsim_data\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 500 × 5\n exam humor mood podcast prepared\n \n 1 -0.435 0.263 -0.100 -0.630 1.07 \n 2 -0.593 0.317 0.143 -1.55 0.0640\n 3 0.786 1.97 -0.591 -0.318 -0.439 \n 4 -0.103 2.86 -0.139 1.07 0.754 \n 5 -0.614 -2.39 0.702 0.464 0.356 \n 6 1.01 1.21 0.910 0.769 0.561 \n 7 0.167 -1.37 -0.559 -0.866 0.214 \n 8 1.16 0.164 -0.743 0.969 -1.67 \n 9 0.650 0.215 
-0.248 0.691 -0.303 \n10 0.156 0.713 1.19 -1.02 -0.219 \n# ℹ 490 more rows\n```\n\n\n:::\n:::\n\n\nSince we have simulated this data, we know that this is a case where *standard methods will succeed* (see @sec-standard) and, therefore, can estimate the causal effect using a basic linear regression model.\n@fig-dag-sim shows a forest plot of the simulated data based on our DAG.\nNotice the model that only included the exposure resulted in a spurious effect (an estimate of -0.1 when we know the truth is 0).\nIn contrast, the model that adjusted for the two variables as suggested by `ggdag_adjustment_set()` is not spurious (much closer to 0).\n\n\n::: {.cell}\n\n```{.r .cell-code}\n## Model that does not close backdoor paths\nunadjusted_model <- lm(exam ~ podcast, sim_data) |>\n broom::tidy(conf.int = TRUE) |>\n dplyr::filter(term == \"podcast\") |>\n mutate(formula = \"podcast\")\n\n## Model that closes backdoor paths\nadjusted_model <- lm(exam ~ podcast + mood + prepared, sim_data) |>\n broom::tidy(conf.int = TRUE) |>\n dplyr::filter(term == \"podcast\") |>\n mutate(formula = \"podcast + mood + prepared\")\n\nbind_rows(\n unadjusted_model,\n adjusted_model\n) |>\n ggplot(aes(x = estimate, y = formula, xmin = conf.low, xmax = conf.high)) +\n geom_vline(xintercept = 0, linewidth = 1, color = \"grey80\") +\n geom_pointrange(fatten = 3, size = 1) +\n theme_minimal(18) +\n labs(\n y = NULL,\n caption = \"correct effect size: 0\"\n )\n```\n\n::: {.cell-output-display}\n![Forest plot of simulated data based on the DAG described in @fig-dag-podcast.](chapter-05_files/figure-html/fig-dag-sim-1.png){#fig-dag-sim width=672}\n:::\n:::\n\n\n## Structures of Causality\n\n### Advanced Confounding\n\nIn `podcast_dag`, `mood` and `prepared` were *direct* confounders: an arrow was going directly from them to `podcast` and `exam`.\nOften, backdoor paths are more complex.\nLet's consider such a case by adding two new variables: `alertness` and `skills_course`.\n`alertness` represents 
the feeling of alertness from a good mood, thus the arrow from `mood` to `alertness`.\n`skills_course` represents whether the student took a College Skills Course and learned time management techniques.\nNow, `skills_course` is what frees up the time to listen to a podcast as well as being prepared for the exam.\n`mood` and `prepared` are no longer direct confounders: they are two variables along a more complex backdoor path.\nAdditionally, we've added an arrow going from `humor` to `mood`.\nLet's take a look at @fig-podcast_dag2.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag2 <- dagify(\n podcast ~ mood + humor + skills_course,\n alertness ~ mood,\n mood ~ humor,\n prepared ~ skills_course,\n exam ~ alertness + prepared,\n coords = time_ordered_coords(),\n exposure = \"podcast\",\n outcome = \"exam\",\n labels = c(\n podcast = \"podcast\",\n exam = \"exam score\",\n mood = \"mood\",\n alertness = \"alertness\",\n skills_course = \"college\\nskills course\",\n humor = \"humor\",\n prepared = \"prepared\"\n )\n)\n\nggdag(podcast_dag2, use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![An expanded version of `podcast_dag` that includes two additional variables: `skills_course`, representing a College Skills Course, and `alertness`.](chapter-05_files/figure-html/fig-podcast_dag2-1.png){#fig-podcast_dag2 width=480}\n:::\n:::\n\n::: {.cell}\n\n:::\n\n\nNow there are *three* backdoor paths we need to close: `podcast <- humor -> mood -> alertness -> exam`, `podcast <- mood -> alertness -> exam`, and `podcast <- skills_course -> prepared -> exam`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggdag_paths(podcast_dag2, use_labels = \"label\", text = FALSE, shadow = TRUE)\n```\n\n::: {.cell-output-display}\n![Three open paths in `podcast_dag2`. 
Since there is no effect of `podcast` on `exam`, all three are backdoor paths that must be closed to get the correct effect.](chapter-05_files/figure-html/fig-podcast_dag2-paths-1.png){#fig-podcast_dag2-paths width=1056}\n:::\n:::\n\n\nThere are four minimal adjustment sets to close all three paths (and eighteen full adjustment sets!).\nThe minimal adjustment sets are `alertness + prepared`, `alertness + skills_course`, `mood + prepared`, `mood + skills_course`.\nWe can now block the open paths in several ways.\n`mood` and `prepared` still work, but we've got other options now.\nNotably, `prepared` and `alertness` could happen at the same time or even after `podcast`.\n`skills_course` and `mood` still happen before both `podcast` and `exam`, so the idea is still the same: the confounding pathway starts before the exposure and outcome.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggdag_adjustment_set(podcast_dag2, use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![Valid minimal adjustment sets that will close the backdoor paths in @fig-podcast_dag2-paths.](chapter-05_files/figure-html/fig-podcast_dag2-set-1.png){#fig-podcast_dag2-set width=672}\n:::\n:::\n\n\nDeciding between these adjustment sets is a matter of judgment: if all data are perfectly measured, the DAG is correct, and we've modeled them correctly, then it doesn't matter which we use.\nEach adjustment set will result in an unbiased estimate.\nAll three of those assumptions are usually untrue to some degree.\nLet's consider the path via `skills_course` and `prepared`.\nIt may be that we are better able to assess whether or not someone took the College Skills Course than how prepared for the exam they are.\nIn that case, an adjustment set with `skills_course` is a better option.\nBut perhaps we better understand the relationship between preparedness and exam results.\nIf we have it measured, controlling for that might be better.\nWe could get the best of both worlds by including both 
variables: between the better measurement of `skills_course` and the better modeling of `prepared`, we might have a better chance of minimizing confounding from this path.\n\n### Selection Bias and Mediation\n\nSelection bias is another name for the type of bias that is induced by adjusting for a collider.\nIt's called \"selection bias\" because a common form of collider-induced bias is a variable inherently stratified upon by the design of the study---selection *into* the study.\nLet's consider a case based on the original `podcast_dag` but with one additional variable: whether or not the student showed up to the exam.\nNow, there is an indirect effect of `podcast` on `exam`: listening to a podcast influences whether or not the students attend the exam.\nThe true result of `exam` is missing for those who didn't show up; by studying the group of people who *did* show up, we are inherently stratifying on this variable.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag3 <- dagify(\n podcast ~ mood + humor + prepared,\n exam ~ mood + prepared + showed_up,\n showed_up ~ podcast + mood + prepared,\n coords = time_ordered_coords(\n list(\n # time point 1\n c(\"prepared\", \"humor\", \"mood\"), \n # time point 2\n \"podcast\", \n \"showed_up\", \n # time point 3\n \"exam\"\n )\n ),\n exposure = \"podcast\",\n outcome = \"exam\",\n labels = c(\n podcast = \"podcast\",\n exam = \"exam score\",\n mood = \"mood\",\n humor = \"humor\",\n prepared = \"prepared\",\n showed_up = \"showed up\"\n )\n)\nggdag(podcast_dag3, use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![Another variant of `podcast_dag`, this time including the inherent stratification on those who appear for the exam. 
There is still no direct effect of `podcast` on `exam`, but there is an indirect effect via `showed_up`.](chapter-05_files/figure-html/fig-podcast_dag3-1.png){#fig-podcast_dag3 width=432}\n:::\n:::\n\n\nThe problem is that `showed_up` is both a collider and a mediator: stratifying on it induces a relationship between many of the variables in the DAG but blocks the indirect effect of `podcast` on `exam`.\nLuckily, the adjustment sets can handle the first problem; because `showed_up` happens *before* `exam`, we're less at risk of collider bias between the exposure and outcome.\nUnfortunately, we cannot calculate the total effect of `podcast` on `exam` because part of the effect is missing: the indirect effect is closed at `showed_up`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag3 |> \n adjust_for(\"showed_up\") |> \n ggdag_adjustment_set(text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![The adjustment set for `podcast_dag3` given that the data are inherently conditioned on showing up to the exam. In this case, there is no way to recover an unbiased estimate of the total effect of `podcast` on `exam`.](chapter-05_files/figure-html/fig-podcast_dag3-as-1.png){#fig-podcast_dag3-as width=432}\n:::\n:::\n\n\nSometimes, you can still estimate effects in this situation by changing the estimate you wish to calculate.\nWe can't calculate the total effect because we are missing the indirect effect, but we can still calculate the direct effect of `podcast` on `exam`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag3 |> \n adjust_for(\"showed_up\") |> \n ggdag_adjustment_set(effect = \"direct\", text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![The adjustment set for `podcast_dag3` when targeting a different effect. 
There is one minimal adjustment set that we can use to estimate the direct effect of `podcast` on `exam`.](chapter-05_files/figure-html/fig-podcast_dag3-direct-1.png){#fig-podcast_dag3-direct width=432}\n:::\n:::\n\n\n#### M-Bias and Butterfly Bias\n\nA particular case of selection bias that you'll often see people talk about is *M-bias*.\nIt's called M-bias because it looks like an M when arranged top to bottom.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nm_bias() |> \n ggdag()\n```\n\n::: {.cell-output-display}\n![A DAG representing M-Bias, a situation where a collider predates the exposure and outcome.](chapter-05_files/figure-html/fig-m-bias-1.png){#fig-m-bias width=384}\n:::\n:::\n\n\n::: callout-tip\nggdag has several quick-DAGs for demonstrating basic causal structures, including `confounder_triangle()`, `collider_triangle()`, `m_bias()`, and `butterfly_bias()`.\n:::\n\nWhat's theoretically interesting about M-bias is that `m` is a collider but occurs before `x` and `y`. Remember that association is blocked at a collider, so there is no open path between `x` and `y`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npaths(m_bias())\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n$paths\n[1] \"x <- a -> m <- b -> y\"\n\n$open\n[1] FALSE\n```\n\n\n:::\n:::\n\n\nLet's focus on the `mood` path of the podcast-exam DAG.\nWhat if we were wrong about mood, and the actual relationship was M-shaped?\nLet's say that, rather than causing `podcast` and `exam`, `mood` was itself caused by two mutual causes of `podcast` and `exam`, `u1` and `u2`, as in @fig-podcast_dag4.\nWe don't know what `u1` and `u2` are, and we don't have them measured.\nAs above, there are no open paths in this subset of the DAG.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag4 <- dagify(\n podcast ~ u1,\n exam ~ u2,\n mood ~ u1 + u2,\n coords = time_ordered_coords(list(\n c(\"u1\", \"u2\"),\n \"mood\",\n \"podcast\", \n \"exam\"\n )),\n exposure = \"podcast\",\n outcome = \"exam\",\n labels = c(\n podcast 
= \"podcast\",\n exam = \"exam score\",\n mood = \"mood\",\n u1 = \"unmeasured\",\n u2 = \"unmeasured\"\n ),\n # we don't have them measured\n latent = c(\"u1\", \"u2\")\n)\n\nggdag(podcast_dag4, use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![A reconfiguration of @fig-dag-podcast where `mood` is a collider on an M-shaped path.](chapter-05_files/figure-html/fig-podcast_dag4-1.png){#fig-podcast_dag4 width=528}\n:::\n:::\n\n\nThe problem arises when we think our original DAG is the right DAG: `mood` is in the adjustment set, so we control for it.\nBut this induces bias!\nIt opens up a path between `u1` and `u2`, thus creating a path from `podcast` to `exam`.\nIf we had either `u1` or `u2` measured, we could adjust for them to close this path, but we don't.\nThere is no way to close this open path.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag4 |> \n adjust_for(\"mood\") |> \n ggdag_adjustment_set(use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![The adjustment set where `mood` is a collider. 
If we control for `mood` and don't know about or have the unmeasured causes of `mood`, we have no means of closing the backdoor path opened by adjusting for a collider.](chapter-05_files/figure-html/fig-podcast_dag4-as-1.png){#fig-podcast_dag4-as width=528}\n:::\n:::\n\n\nOf course, the best thing to do here is not control for `mood` at all.\nSometimes, though, that is not an option.\nImagine if, instead of `mood`, this turned out to be the real structure for `showed_up`: since we inherently control for `showed_up`, and we don't have the unmeasured variables, our study results will always be biased.\nIt's essential to understand if we're in that situation so we can address it with sensitivity analysis to understand just how biased the effect would be.\n\nLet's consider a variation on M-bias where `mood` causes `podcast` and `exam` and `u1` and `u2` are mutual causes of `mood` and the exposure and outcome.\nThis arrangement is sometimes called butterfly or bowtie bias, again because of its shape.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nbutterfly_bias(x = \"podcast\", y = \"exam\", m = \"mood\", a = \"u1\", b = \"u2\") |> \n ggdag(text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![In butterfly bias, `mood` is both a collider and a confounder. Controlling for the bias induced by `mood` opens a new pathway because we've also conditioned on a collider. 
We can't properly close all backdoor paths without either `u1` or `u2`.](chapter-05_files/figure-html/fig-butterfly_bias-1.png){#fig-butterfly_bias width=480}\n:::\n:::\n\n\nNow, we're in a challenging position: we need to control for `mood` because it's a confounder, but controlling for `mood` opens up the pathway from `u1` to `u2`.\nBecause we don't have either variable measured, we can't then close the path opened from conditioning on `mood`.\nWhat should we do?\nIt turns out that, when in doubt, controlling for `mood` is the better of the two options: confounding bias tends to be worse than collider bias, and M-shaped structures of colliders are sensitive to slight deviations (e.g., if this is not the exact structure, often the bias isn't as bad).\n\nAnother common form of selection bias is from *loss to follow-up*: people drop out of a study in a way that is related to the exposure and outcome.\nWe'll come back to this topic in [Chapter -@sec-longitudinal].\n\n### Causes of the exposure, causes of the outcome\n\nLet's consider one other type of causal structure that's important: causes of the exposure and not the outcome, and their opposites, causes of the outcome and not the exposure.\nLet's add a variable, `grader_mood`, to the original DAG.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag5 <- dagify(\n podcast ~ mood + humor + prepared,\n exam ~ mood + prepared + grader_mood,\n coords = time_ordered_coords(\n list(\n # time point 1\n c(\"prepared\", \"humor\", \"mood\"), \n # time point 2\n c(\"podcast\", \"grader_mood\"), \n # time point 3\n \"exam\"\n )\n ),\n exposure = \"podcast\",\n outcome = \"exam\",\n labels = c(\n podcast = \"podcast\",\n exam = \"exam score\",\n mood = \"student\\nmood\",\n humor = \"humor\",\n prepared = \"prepared\",\n grader_mood = \"grader\\nmood\"\n )\n)\nggdag(podcast_dag5, use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![A DAG containing a cause of the exposure that is not the cause of the 
outcome (`humor`) and a cause of the outcome that is not a cause of the exposure (`grader_mood`).](chapter-05_files/figure-html/fig-podcast_dag5-1.png){#fig-podcast_dag5 width=480}\n:::\n:::\n\n\nThere are now two variables that aren't related to *both* the exposure and the outcome: `humor`, which causes `podcast` but not `exam`, and `grader_mood`, which causes `exam` but not `podcast`.\nLet's start with `humor`.\n\nVariables that cause the exposure but not the outcome are also called *instrumental variables* (IVs).\nIVs are an unusual circumstance where, under certain conditions, controlling for them can make other types of bias worse.\nWhat's unique about this is that IVs can *also* be used to conduct an entirely different approach to estimating an unbiased effect of the exposure on the outcome.\nIVs are commonly used this way in econometrics and are increasingly popular in other areas.\nIn short, IV analysis allows us to estimate the causal effect using a different set of assumptions than the approaches we've talked about thus far.\nSometimes, a problem intractable using propensity score methods can be addressed using IVs and vice versa.\nWe'll talk more about IVs in @sec-iv-friends.\n\nSo, if you're *not* using IV methods, should you include an IV in a model meant to address confounding?\nIf you're unsure if the variable is an IV or not, you should probably add it to your model: it's more likely to be a confounder than an IV, and, it turns out, the bias from adding an IV is usually small in practice.\nSo, like adjusting for a potential M-structure variable, the risk of bias is worse from confounding.\n\nNow, let's talk about the opposite of an IV: a cause of the outcome that is not the cause of the exposure.\nThese variables are sometimes called *competing exposures* (because they also cause the outcome) or *precision variables* (because, as we'll see, they increase the precision of causal estimates).\nWe'll call them precision variables because we're concerned 
about the relationship to the research question at hand, not to another research question where they are exposures.\n\nLike IVs, precision variables do not occur along paths from the exposure to the outcome.\nThus, including them is not necessary.\nUnlike IVs, including precision variables is beneficial.\nIncluding other causes of the outcomes helps a statistical model capture some of its variation.\nThis doesn't impact the point estimate of the effect, but it does reduce the variance, resulting in smaller standard errors and narrower confidence intervals.\nThus, we recommend including them when possible.\n\nSo, even though we don't need to control for `grader_mood`, if we have it in the data set, we should.\nSimilarly, `humor` is not a good addition to the model unless we think it really might be a confounder; if it is a valid instrument, we might want to consider using IV methods to estimate the effect instead.\n\n### Measurement Error and Missingness\n\nDAGs can also help us understand the bias arising from mismeasurements in the data, including the worst mismeasurement: not measuring it at all.\nWe'll cover these topics in [Chapter -@sec-missingness], but the basic idea is that by separating the actual value from the observed value, we can better understand how such biases may behave.\nHere's a basic example of a bias called *recall bias*.\nRecall bias is when the outcome influences a participant's memory of exposure, so it's a particular problem in retrospective studies where the earlier exposure is not recorded until after the outcome happens.\nAn example of when this can occur is a case-control study of cancer.\nSomeone *with* cancer may be more motivated to ruminate on their past exposures than someone *without* cancer.\nSo, their memory about a given exposure may be more refined than someone without.\nBy conditioning on the observed version of the exposure, we open up many collider paths.\nUnfortunately, there is no way to close them all.\nIf this is the 
case, we must investigate how severe the bias would be in practice.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nerror_dag <- dagify(\n exposure_observed ~ exposure_real + exposure_error,\n outcome_observed ~ outcome_real + outcome_error,\n outcome_real ~ exposure_real,\n exposure_error ~ outcome_real,\n labels = c(\n exposure_real = \"Exposure\\n(truth)\",\n exposure_error = \"Measurement Error\\n(exposure)\",\n exposure_observed = \"Exposure\\n(observed)\",\n outcome_real = \"Outcome\\n(truth)\",\n outcome_error = \"Measurement Error\\n(outcome)\",\n outcome_observed = \"Outcome\\n(observed)\"\n ),\n exposure = \"exposure_real\",\n outcome = \"outcome_real\",\n coords = time_ordered_coords()\n)\n\nerror_dag |> \n ggdag(text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![A DAG representing measurement error in observing the exposure and outcome. In this case, the outcome impacts the participant's memory of the exposure, also known as recall bias.](chapter-05_files/figure-html/fig-error_dag-1.png){#fig-error_dag width=528}\n:::\n:::\n\n\n## Recommendations in building DAGs\n\nIn principle, using DAGs is easy: specify the causal relationships you think exist and then query the DAG for information like valid adjustment sets.\nIn practice, assembling DAGs takes considerable time and thought.\nNext to defining the research question itself, it's one of the most challenging steps in making causal inferences.\nVery little guidance exists on best practices in assembling DAGs.\n@Tennant2021 collected data on DAGs in applied health research to better understand how researchers used them.\n@tbl-dag-properties shows some information they collected: the median number of nodes and arcs in a DAG, their ratio, the saturation percent of the DAG, and how many were fully saturated.\nSaturating DAGs means adding all possible arrows going forward in time, e.g., in a fully saturated DAG, any given variable at time point 1 has arrows going to all variables in future 
time points, and so on.\nMost DAGs were only about half saturated, and very few were fully saturated.\n\nOnly about half of the papers using DAGs reported the adjustment set used.\nIn other words, researchers presented their assumptions about the research question but not the implications about how they should handle the modeling stage or if they did use a valid adjustment set.\nSimilarly, the majority of studies did not report the estimand of interest.\n\n::: callout-note\nThe estimand is the target of interest in terms of what we're trying to estimate, as discussed briefly in [Chapter -@sec-whole-game].\nWe'll discuss estimands in detail in [Chapter -@sec-estimands].\n:::\n\n\n::: {#tbl-dag-properties .cell tbl-cap='A table of DAG properties measured by @Tennant2021. Number of nodes and arcs are the median number of variables and arrows in the analyzed DAGs, while the Node to Arc ratio is their ratio. Saturation proportion is the proportion of all possible arrows going forward in time to other included variables. Fully saturated DAGs are those that include all such arrows. @Tennant2021 also analyzed whether studies reported their estimands and adjustment sets.'}\n::: {.cell-output-display}\n\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n \n \n \n\n \n\n \n\n \n\n \n\n \n\n \n \n \n \n \n \n \n
CharacteristicN = 1441
DAG properties
Number of Nodes12 (9, 16)
Number of Arcs29 (19, 41)
Node to Arc Ratio2.30 (1.78, 3.00)
Saturation Proportion0.46 (0.31, 0.67)
Fully Saturated
    Yes4 (3%)
    No140 (97%)
Reporting
Reported Estimand
    Yes40 (28%)
    No104 (72%)
Reported Adjustment Set
    Yes80 (56%)
    No64 (44%)
1 Median (IQR); n (%)
\n
\n```\n\n:::\n:::\n\n\nIn this section, we'll offer some advice from @Tennant2021 and our own experience assembling DAGs.\n\n### Iterate early and often\n\nOne of the best things you can do for the quality of your results is to make the DAG before you conduct the study, ideally before you even collect the data.\nIf you're already working with your data, at minimum, build your DAG before doing data analysis.\nThis advice is similar in spirit to pre-registered analysis plans: declaring your assumptions ahead of time can help clarify what you need to do, reduce the risk of overfitting (e.g., determining confounders incorrectly from the data), and give you time to get feedback on your DAG.\n\nThis last benefit is significant: you should ideally democratize your DAG.\nShare it early and often with others who are experts on the data, domain, and models.\nIt's natural to create a DAG, present it to your colleagues, and realize you have missed something important.\nSometimes, you will only agree on some details of the structure.\nThat's a good thing: you know now where there is uncertainty in your DAG.\nYou can then examine the results from multiple plausible DAGs or address the uncertainty with sensitivity analyses.\n\nIf you have more than one candidate DAG, check their adjustment sets.\nIf two DAGs have overlapping adjustment sets, focus on those sets; then, you can move forward in a way that satisfies the plausible assumptions you have.\n\n### Consider your question\n\nAs we saw in @fig-podcast_dag3, some questions can be challenging to answer with certain data, while others are more approachable.\nYou should consider precisely what it is you want to estimate.\nDefining your target estimate is an important topic and the subject of [Chapter -@sec-estimands].\n\nAnother important detail about how your DAG relates to your question is the population and time.\nMany causal structures are not static over time and space.\nConsider lung cancer: the distribution of causes of 
lung cancer was considerably different before the spread of smoking.\nIn medieval Japan, before the spread of tobacco from the Americas centuries later, the causal structure for lung cancer would have been practically different from what it is in Japan today, both in terms of tobacco use and other factors (age of the population, etc.).\n\nThe same is true for confounders.\nEven if something *can* cause the exposure and outcome, if the prevalence of that thing is zero in the population you're analyzing, it's irrelevant to the causal question.\nIt may also be that, in some populations, it doesn't affect one of the two.\nThe reverse is also true: something might be unique to the target population.\nThe use of tobacco in North America several centuries ago was unique among the world population, even though ceremonial tobacco use was quite different from modern recreational use.\nMany changes won't happen as dramatically as across centuries, but sometimes, they do, e.g., if regulation in one country effectively eliminates the population's exposure to something.\n\n### Order nodes by time {#sec-time-ordered}\n\nAs discussed earlier, we recommend ordering your variables by time, either left-to-right or up-to-down.\nThere are two reasons for this.\nFirst, time ordering is an integral part of your assumptions.\nAfter all, something happening before another thing is a requirement for it to be a cause.\nThinking this through carefully will clarify your DAG and the variables you need to address.\n\nSecond, after a certain level of complexity, it's easier to read a DAG when arranged by time because you have to think less about that dimension; it's inherent to the layout.\nThe time ordering algorithm in ggdag automates much of this for you, although, as we saw earlier, it's sometimes helpful to give it more information about the order.\n\nA related topic is feedback loops.\nOften, we think about two things that mutually cause each other as happening in a circle, like global 
warming and A/C use (A/C use increases global warming, which makes it hotter, which increases A/C use, and so on).\nIt's tempting to visualize that relationship like this:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n ac_use ~ global_temp,\n global_temp ~ ac_use,\n labels = c(ac_use = \"A/C use\", global_temp = \"Global\\ntemperature\")\n) |> \n ggdag(layout = \"circle\", edge_type = \"arc\", text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![A DAG representing the reciprocal relationship between A/C use and global temperature because of global warming. Feedback loops are useful mental shorthands to describe variables that impact each other over time compactly, but they are not true causal diagrams.](chapter-05_files/figure-html/fig-feedback-loop-1.png){#fig-feedback-loop width=432}\n:::\n:::\n\n\nFrom a DAG perspective, this is a problem because of the *A* part of *DAG*: it's cyclic!\nImportantly, though, it's also not correct from a causal perspective.\nFeedback loops are a shorthand for what really happens, which is that the two variables mutually affect each other *over time*.\nCausality only goes forward in time, so it doesn't make sense to go back and forth like in @fig-feedback-loop.\n\nThe real DAG looks something like this:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n global_temp_2000 ~ ac_use_1990 + global_temp_1990,\n ac_use_2000 ~ ac_use_1990 + global_temp_1990,\n global_temp_2010 ~ ac_use_2000 + global_temp_2000,\n ac_use_2010 ~ ac_use_2000 + global_temp_2000,\n global_temp_2020 ~ ac_use_2010 + global_temp_2010,\n ac_use_2020 ~ ac_use_2010 + global_temp_2010,\n coords = time_ordered_coords(),\n labels = c(\n ac_use_1990 = \"A/C use\\n(1990)\", \n global_temp_1990 = \"Global\\ntemperature\\n(1990)\",\n ac_use_2000 = \"A/C use\\n(2000)\", \n global_temp_2000 = \"Global\\ntemperature\\n(2000)\",\n ac_use_2010 = \"A/C use\\n(2010)\", \n global_temp_2010 = \"Global\\ntemperature\\n(2010)\",\n ac_use_2020 = \"A/C 
use\\n(2020)\", \n global_temp_2020 = \"Global\\ntemperature\\n(2020)\"\n )\n) |> \n ggdag(text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![A DAG showing the relationship between A/C use and global temperature over time. The true causal relationship in a feedback loop goes *forward*.](chapter-05_files/figure-html/fig-feedforward-1.png){#fig-feedforward width=480}\n:::\n:::\n\n\nThe two variables, rather than being in a feed*back* loop, are actually in a feed*forward* loop: they co-evolve over time.\nHere, we only show four discrete moments in time (the decades from 1990 to 2020), but of course, we could get much finer depending on the question and data.\n\nAs with any DAG, the proper analysis approach depends on the question.\nThe effect of A/C use in 2000 on the global temperature in 2020 produces a different adjustment set than the global temperature in 2000 on A/C use in 2020.\nSimilarly, whether we also model this change over time or just those two time points depends on the question.\nOften, these feedforward relationships require you to address *time-varying* confounding, which we'll discuss in [Chapter -@sec-longitudinal].\n\n### Consider the whole data collection process\n\nAs @fig-podcast_dag3 showed us, it's essential to consider the *way* we collected data as much as the causal structure of the question.\nConsidering the whole data collection process is particularly true if you're working with \"found\" data---a data set not intentionally collected to answer the research question.\nWe are always inherently conditioning on the data we have vs. 
the data we don't have.\nIf other variables influenced the data collection process in the causal structure, you need to consider the impact.\nDo you need to control for additional variables?\nDo you need to change the effect you are trying to estimate?\nCan you answer the question at all?\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n::: callout-tip\n## What about case-control studies?\n\nA standard study design in epidemiology is the case-control study.\nCase-control studies are beneficial when the outcome under study is rare or takes a very long time to happen (like many types of cancer).\nParticipants are selected into the study based on their outcome: once a person has an event, they are entered as a case and matched with a control who hasn't had the event.\nOften, they are matched on other factors as well.\n\nMatched case-control studies are selection biased by design.\nIn @fig-case-control, when we condition on selection into the study, we lose the ability to close all backdoor paths, even if we control for `confounder`.\nFrom the DAG, it would appear that the entire design is invalid!\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n outcome ~ confounder + exposure,\n selection ~ outcome + confounder,\n exposure ~ confounder,\n exposure = \"exposure\",\n outcome = \"outcome\",\n coords = time_ordered_coords()\n) |> \n ggdag(edge_type = \"arc\", text_size = 2.2)\n```\n\n::: {.cell-output-display}\n![A DAG representing a matched case-control study. In such a study, selection is determined by outcome status and any matched confounders. Selection into the study is thus a collider. 
Since it is inherently stratified on who is actually in the study, such data are limited in the types of causal effects they can estimate.](chapter-05_files/figure-html/fig-case-control-1.png){#fig-case-control width=432}\n:::\n:::\n\n\nLuckily, this isn't wholly true.\nCase-control studies are limited in the type of causal effects they can estimate (causal odds ratios, which under some circumstances approximate causal risk ratios).\nWith careful study design and sampling, the math works out such that these estimates are still valid.\nExactly how and why case-control studies work is beyond the scope of this book, but they are a remarkably clever design.\n:::\n\n### Include variables you don't have\n\nIt's critical that you include *all* variables important to the causal structure, not just the variables you have measured in your data.\nggdag can mark variables as unmeasured (\"latent\"); it will then return only usable adjustment sets, e.g., those without the unmeasured variables.\nOf course, the best thing to do is to use DAGs to help you understand what to measure in the first place, but there are many reasons why your data might be different.\nEven data intentionally collected for the research question might not have a variable discovered to be a confounder after data collection.\n\nFor instance, if we have a DAG where `exposure` and `outcome` have a confounding pathway consisting of `confounder1` and `confounder2`, we can control for either to successfully debias the estimate:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n outcome ~ exposure + confounder1,\n exposure ~ confounder2,\n confounder2 ~ confounder1,\n exposure = \"exposure\",\n outcome = \"outcome\"\n) |> \n adjustmentSets()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n{ confounder1 }\n{ confounder2 }\n```\n\n\n:::\n:::\n\n\nThus, if just one is missing (`latent`), then we are ok:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n outcome ~ exposure + confounder1,\n exposure ~ 
confounder2,\n confounder2 ~ confounder1,\n exposure = \"exposure\",\n outcome = \"outcome\",\n latent = \"confounder1\"\n) |> \n adjustmentSets()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n{ confounder2 }\n```\n\n\n:::\n:::\n\n\nBut if both are missing, there are no valid adjustment sets.\n\nWhen you don't have a variable measured, you still have a few options.\nAs mentioned above, you may be able to identify alternate adjustment sets.\nIf the missing variable is required to close all backdoor paths completely, you can and should conduct a sensitivity analysis to understand the impact of not having it.\nThis is the subject of [Chapter -@sec-sensitivity].\n\nUnder some lucky circumstances, you can also use a *proxy* confounder.\nA proxy confounder is a variable closely related to the confounder such that controlling for it controls for some of the effects of the missing variable.\nConsider an expansion of the fundamental confounding relationship where `q` has a cause, `p`, as in @fig-proxy-confounder.\nTechnically, if we don't have `q`, we can't close the backdoor path, and our effect will be biased.\nPractically, though, if `p` is highly correlated with `q`, it can serve as a method to reduce the confounding from `q`.\nYou can think of `p` as a mismeasured version of `q`; it will seldom wholly control for the bias via `q`, but it can help minimize it.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n y ~ x + q,\n x ~ q,\n q ~ p,\n coords = time_ordered_coords()\n) |> \n ggdag(edge_type = \"arc\")\n```\n\n::: {.cell-output-display}\n![A DAG with a confounder, `q`, and a proxy confounder, `p`. The true adjustment set is `q`. 
Since `p` causes `q`, it contains information about `q` and can reduce the bias if we don't have `q` measured.](chapter-05_files/figure-html/fig-proxy-confounder-1.png){#fig-proxy-confounder width=432}\n:::\n:::\n\n\n### Saturate your DAG, then prune\n\nIn discussing @tbl-dag-properties, we mentioned *saturated* DAGs.\nThese are DAGs where all possible arrows are included based on the time ordering, e.g., every variable causes variables that come after it in time.\n\n*Not* including an arrow is a bigger assumption than including one.\nIn other words, your default should be to have an arrow from one variable to a future variable.\nThis default is counterintuitive to many people.\nHow can it be that we need to be so careful about assessing causal effects yet be so liberal in applying causal assumptions in the DAG?\nThe answer to this lies in the strength and prevalence of the cause.\nTechnically, an arrow present means that *for at least a single observation*, the prior node causes the following node.\nThe arrow similarly says nothing about the strength of the relationship.\nSo, a minuscule causal effect on a single individual justifies the presence of an arrow.\nIn practice, such a case is probably not relevant.\nThere is *effectively* no arrow.\n\nThe more significant point, though, is that you should feel confident to add an arrow.\nThe bar for justification is much lower than you think.\nInstead, it's helpful to 1) determine your time ordering, 2) saturate the DAG, and 3) prune out implausible arrows.\n\nLet's experiment by working through a saturated version of the podcast-exam DAG.\n\nFirst, the time-ordering.\nPresumably, the student's sense of humor far predates the day of the exam.\nMood in the morning, too, predates listening to the podcast or exam score, as does preparation.\nThe saturated DAG given this ordering is:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![A saturated version of `podcast_dag`: variables have all possible arrows going forward to 
other variables over time.](chapter-05_files/figure-html/fig-podcast_dag_sat-1.png){#fig-podcast_dag_sat width=528}\n:::\n:::\n\n\nThere are a few new arrows here.\nHumor now causes the other two confounders, as well as exam score.\nSome of them make sense.\nSense of humor probably affects mood for some people.\nWhat about preparedness?\nThis relationship seems a little less plausible.\nSimilarly, we know that a sense of humor does not affect exam scores in this case because the grading is blinded.\nLet's prune those two.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![A pruned version of @fig-podcast_dag_sat: we've removed implausible arrows from the fully saturated DAGs.](chapter-05_files/figure-html/fig-podcast_dag_pruned-1.png){#fig-podcast_dag_pruned width=528}\n:::\n:::\n\n\nThis DAG seems more reasonable.\nSo, was our original DAG wrong?\nThat depends on several factors.\nNotably, both DAGs produce the same adjustment set: controlling for `mood` and `prepared` will give us an unbiased effect if either DAG is correct.\nEven if the new DAG were to produce a different adjustment set, whether the result is meaningfully different depends on the strength of the confounding.\n\n### Include instruments and precision variables\n\nTechnically, you do not need to include instrumental and precision variables in your DAG.\nThe adjustment sets will be the same with and without them.\nHowever, adding them is helpful for two reasons.\nFirstly, they demonstrate your assumptions about their relationships and the variables under study.\nAs discussed above, *not* including an arrow is a more significant assumption than including one, so it's valuable information about how you think the causal structure operates.\nSecondly, it impacts your modeling decision.\nYou should always include precision variables in your model to reduce variability in your estimate so it helps you identify those.\nInstruments are also helpful to see because they may guide alternative or complementary 
modeling strategies, as we'll discuss in @sec-evidence.\n\n### Focus on the causal structure, then consider measurement bias\n\nAs we saw above, missingness and measurement error can be a source of bias.\nAs we'll see in [Chapter -@sec-missingness], we have several strategies to approach such a situation.\nYet, almost everything we measure is inaccurate to some degree.\nThe true DAG for the data at hand inherently conditions on the measured version of variables.\nIn that sense, your data are always subtly-wrong, a sort of unreliable narrator.\nWhen should we include this information in the DAG?\nWe recommend first focusing on the causal structure of the DAG as if you had perfectly measured each variable.\nThen, consider how mismeasurement and missingness might affect the realized data, particularly related to the exposure, outcome, and critical confounders.\nYou may prefer to present this as an alternative DAG to consider strategies for addressing the bias arising from those sources, e.g., imputation or sensitivity analyses.\nAfter all, the DAG in \\@ fig-error_dag makes you think the question is unanswerable because we have no method to close all backdoor paths.\nAs with all open paths, that depends on the severity of the bias and our ability to reckon with it.\n\n\n\n\n\n### Pick adjustment sets most likely to be successful\n\nOne area where measurement error is an important consideration is when picking an adjustment set.\nIn theory, if a DAG is correct, any adjustment set will work to create an unbiased result.\nIn practice, variables have different levels of quality.\nPick an adjustment set most likely to succeed because it contains accurate variables.\nSimilarly, non-minimal adjustment sets are helpful to consider because, together, several variables with measurement error along a backdoor path may be enough to minimize the practical bias resulting from that path.\n\nWhat if you don't have certain critical variables measured and thus do not have a valid 
adjustment set?\nIn that case, you should pick the adjustment set with the best chance of minimizing the bias from other backdoor paths.\nAll is not lost if you don't have every confounder measured: get the highest quality estimate you can, then conduct a sensitivity analysis about the unmeasured variables to understand the impact.\n\n### Use robustness checks\n\nFinally, we recommend checking your DAG for robustness.\nYou can never verify the correctness of your DAG under most conditions, but you can use the implications in your DAG to support it.\nThree types of robustness checks can be helpful depending on the circumstances.\n\n1. **Negative controls**. These come in two flavors: negative exposure controls and negative outcome controls. The idea is to find something associated with one but not the other, e.g., the outcome but not the exposure, so there should be no effect. Since there should be no effect, you now have a measurement for how well you control for *other* effects (e.g., the difference from null). Ideally, the confounders for negative controls are similar to the research question.\n2. **DAG-data consistency**. Negative controls are an implication of your DAG. An extension of this idea is that there are *many* such implications. Because blocking a path removes statistical dependencies from that path, you can check those assumptions in several places in your DAG.\n3. **Alternate adjustment sets**. Adjustment sets should give roughly the same answer because, outside of random and measurement errors, they are all sets that block backdoor paths. 
If more than one adjustment set seems reasonable, you can use that as a sensitivity analysis by checking multiple models.\n\nWe'll discuss these in detail in [Chapter -@sec-sensitivity].\nThe caveat here is that these should be complementary to your initial DAG, not a way of *replacing* it.\nIn fact, if you use more than one adjustment set during your analysis, you should report the results from all of them to avoid overfitting your results to your data.\n", + "markdown": "# Expressing causal questions as DAGs {#sec-dags}\n\n\n\n\n\n## Visualizing Causal Assumptions\n\n> Draw your assumptions before your conclusions --@hernan2021\n\nCausal diagrams are a tool to visualize your assumptions about the causal structure of the questions you're trying to answer.\nIn a randomized experiment, the causal structure is quite simple.\nWhile there may be many causes of an outcome, the only cause of the exposure is the randomization process itself (we hope!).\nIn many non-randomized settings, however, the structure of your question can be a complex web of causality.\nCausal diagrams help communicate what we think this structure looks like.\nIn addition to being open about what we think the causal structure is, causal diagrams have incredible mathematical properties that allow us to identify a way to estimate unbiased causal effects even with observational data.\n\nCausal diagrams are also increasingly common.\nData collected as a review of causal diagrams in applied health research papers show a drastic increase in use over time [@Tennant2021].\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Percentage of health research papers using causal diagrams over time.](chapter-05_files/figure-html/fig-dag-usage-1.png){#fig-dag-usage width=672}\n:::\n:::\n\n\nThe type of causal diagrams we use are also called directed acyclic graphs (DAGs)[^chapter-05-1].\nThese graphs are directed because they include arrows going in a specific direction.\nThey're acyclic because they don't go in 
circles; a variable can't cause itself, for instance.\nDAGs are used for various problems, but we're specifically concerned with *causal* DAGs.\nThis class of DAGs is sometimes called Structural Causal Models (SCMs) because they are a model of the causal structure of a question [@hernan2021; @Pearl_Glymour_Jewell_2021].\n\n[^chapter-05-1]: An essential but rarely observed detail of DAGs is that dag is also an [affectionate Australian insult](https://en.wikipedia.org/wiki/Dag_(slang)) referring to the dung-caked fur of a sheep, a *daglock*.\n\nDAGs depict causal relationships between variables.\nVisually, the way they depict variables is as *edges* and *nodes*.\nEdges are the arrows going from one variable to another, sometimes called arcs or just arrows.\nNodes are the variables themselves, sometimes called vertices, points, or just variables.\nIn @fig-dag-basic, there are two nodes, `x` and `y`, and one edge going from `x` to `y`.\nHere, we are saying that `x` causes `y`.\n`y` \"listens\" to `x` [@Pearl_Glymour_Jewell_2021].\n\n\n::: {.cell}\n::: {.cell-output-display}\n![A causal directed acyclic graph (DAG). DAGs depict causal relationships. In this DAG, the assumption is that `x` causes `y`.](chapter-05_files/figure-html/fig-dag-basic-1.png){#fig-dag-basic width=288}\n:::\n:::\n\n\nIf we're interested in the causal effect of `x` on `y`, we're trying to estimate a numeric representation of that arrow.\nUsually, though, there are many other variables and arrows in the causal structure of a given question.\nA series of arrows is called a *path*.\nThere are three types of paths you'll see in DAGs: forks, chains, and colliders (sometimes called inverse forks).\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Three types of causal relationships: forks, chains, and colliders. The direction of the arrows and the relationships of interest dictate which type of path a series of variables is. 
Forks represent a mutual cause, chains represent direct causes, and colliders represent a mutual descendant.](chapter-05_files/figure-html/fig-dag-path-types-1.png){#fig-dag-path-types width=672}\n:::\n:::\n\n\nForks represent a common cause of two variables.\nHere, we're saying that `q` causes both `x` and `y`, the traditional definition of a confounder.\nThey're called forks because the arrows from `x` to `y` are in different directions.\nChains, on the other hand, represent a series of arrows going in the same direction.\nHere, `q` is called a *mediator*: it is along the causal path from `x` to `y`.\nIn this diagram, the only path from `x` to `y` is mediated through `q`.\nFinally, a collider is a path where two arrowheads meet at a variable.\nBecause causality always goes forward in time, this naturally means that the collider variable is caused by two other variables.\nHere, we're saying that `x` and `y` both cause `q`.\n\n::: callout-tip\n## Are DAGs SEMs?\n\nIf you're familiar with structural equation models (SEMs), a modeling technique commonly used in psychology and other social science settings, you may notice some similarities between SEMs and DAGs.\nDAGs are a form of *non-parametric* SEM.\nSEMs estimate entire graphs using parametric assumptions.\nCausal DAGs, on the other hand, don't estimate anything; an arrow going from one variable to another says nothing about the strength or functional form of that relationship, only that we think it exists.\n:::\n\nOne of the significant benefits of DAGs is that they help us identify sources of bias and, often, provide clues on how to address them.\nHowever, talking about an unbiased effect estimate only makes sense when we have a specific causal question in mind.\nSince each arrow represents a cause, it's causality all the way down; no individual arrow is inherently problematic.\nHere, we're interested in the effect of `x` on `y`.\nThis question defines which paths we're interested in and which we're 
not.\n\nThese three types of paths have different implications for the statistical relationship between `x` and `y`.\nIf we only look at the correlation between the two variables under these assumptions:\n\n1. In the fork, `x` and `y` will be associated, despite there being no arrow from `x` to `y`.\n2. In the chain, `x` and `y` are related only through `q`.\n3. In the collider, `x` and `y` will *not* be related.\n\nPaths that transmit association are called *open paths*.\nPaths that do not transmit association are called *closed paths*.\nForks and chains are open, while colliders are closed.\n\nSo, should we adjust for `q`?\nThat depends on the nature of the path.\nForks are confounding paths.\nBecause `q` causes both `x` and `y`, `x` and `y` will have a spurious association.\nThey both contain information from `q`, their mutual cause.\nThat mutual causal relationship makes `x` and `y` associated statistically.\nAdjusting for `q` will *block* the bias from confounding and give us the true relationship between `x` and `y`.\n\n::: callout-tip\n## Adjustment\n\nWe can use a variety of techniques to account for a variable.\nWe use the term \"adjustment\" or \"controlling for\" to refer to any technique that removes the effect of variables we're not interested in.\n:::\n\n@fig-confounder-scatter depicts this effect visually.\nHere, `x` and `y` are continuous, and by definition of the DAG, they are unrelated.\n`q`, however, causes both.\nThe unadjusted effect is biased because it includes information about the open path from `x` to `y` via `q`.\nWithin levels of `q`, however, `x` and `y` are unrelated.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Two scatterplots of the relationship between `x` and `y`. With forks, the relationship is biased by `q`. 
When accounting for `q`, we see the true null relationship.](chapter-05_files/figure-html/fig-confounder-scatter-1.png){#fig-confounder-scatter width=672}\n:::\n:::\n\n\nFor chains, whether or not we adjust for mediators depends on the research question.\nHere, adjusting for `q` would result in a null estimate of the effect of `x` on `y`.\nBecause the only effect of `x` on `y` is via `q`, no other effect remains.\nThe effect of `x` on `y` mediated by `q` is called the *indirect* effect, while the effect of `x` on `y` directly is called the *direct* effect.\nIf we're only interested in the direct effect, controlling for `q` might be what we want.\nIf we want to know about both effects, we shouldn't try to adjust for `q`.\nWe'll learn more about estimating these and other mediation effects in @sec-mediation.\n\n@fig-mediator-scatter shows this effect visually.\nThe unadjusted effect of `x` on `y` represents the total effect.\nSince the total effect is due entirely to the path mediated by `q`, when we adjust for `q`, no relationship remains.\nThis null effect is the direct effect.\nNeither of these effects is due to bias, but each answers a different research question.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Two scatterplots of the relationship between `x` and `y`. With chains, whether and how we should account for `q` depends on the research question. Without doing so, we see the impact of the total effect of `x` and `y`, including the indirect effect via `q`. 
When accounting for `q`, we see the direct (null) effect of `x` on `y`.](chapter-05_files/figure-html/fig-mediator-scatter-1.png){#fig-mediator-scatter width=672}\n:::\n:::\n\n\nColliders are different.\nIn the collider DAG of @fig-dag-path-types, `x` and `y` are *not* associated, but both cause `q`.\nAdjusting for `q` has the opposite effect than with confounding: it *opens* a biasing pathway.\nSometimes, people draw the path opened up by conditioning on a collider connecting `x` and `y`.\n\nVisually, we can see this happen when `x` and `y` are continuous and `q` is binary.\nIn @fig-collider-scatter, when we don't include `q`, we find no relationship between `x` and `y`.\nThat's the correct result.\nHowever, when we include `q`, we can detect information about both `x` and `y`, and they appear correlated: across levels of `x`, those with `q = 0` have lower levels of `y`.\nAssociation seemingly flows back in time.\nOf course, that can't happen from a causal perspective, so controlling for `q` is the wrong thing to do.\nWe end up with a biased effect of `x` on `y`.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![Two scatterplots of the relationship between `x` and `y`. The unadjusted relationship between the two is unbiased. When accounting for `q`, we open a colliding backdoor path and bias the relationship between `x` and `y`.](chapter-05_files/figure-html/fig-collider-scatter-1.png){#fig-collider-scatter width=672}\n:::\n:::\n\n\nHow can this be?\nSince `x` and `y` happen before `q`, `q` can't impact them.\nLet's turn the DAG on its side and consider @fig-collider-time.\nIf we break down the two time points, at time point 1, `q` hasn't happened yet, and `x` and `y` are unrelated.\nAt time point 2, `q` happens due to `x` and `y`.\n*But causality only goes forward in time*.\n`q` happening later can't change the fact that `x` and `y` happened independently in the past.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![A collider relationship over two points in time. 
At time point one, there is no relationship between `x` and `y`. Both cause `q` by time point two, but this does not change what already happened at time point one.](chapter-05_files/figure-html/fig-collider-time-1.png){#fig-collider-time width=672}\n:::\n:::\n\n\nCausality only goes forward.\nAssociation, however, is time-agnostic.\nIt's just an observation about the numerical relationships between variables.\nWhen we control for the future, we risk introducing bias.\nIt takes time to develop an intuition for this.\nConsider a case where `x` and `y` are the only causes of `q`, and all three variables are binary.\nWhen *either* `x` or `y` equals 1, then `q` happens.\nIf we know `q = 1` and `x = 0` then logically it must be that `y = 1`.\nThus, knowing about `q` gives us information about `y` via `x`.\nThis example is extreme, but it shows how this type of bias, sometimes called *collider-stratification bias* or *selection bias*, occurs: conditioning on `q` provides statistical information about `x` and `y` and distorts their relationship [@Banack2023].\n\n::: callout-tip\n## Exchangeability revisited\n\nWe commonly refer to exchangeability as the assumption of no confounding.\nActually, this isn't quite right.\nIt's the assumption of no *open, non-causal* paths [@hernan2021].\nMany times, these are confounding pathways.\nHowever, conditioning on a collider can also open paths.\nEven though these aren't confounders, doing so creates non-exchangeability between the two groups: they are different in a way that matters to the exposure and outcome.\n\nOpen, non-causal paths are also called *backdoor paths*.\nWe'll use this terminology often because it captures the idea well: these are any open paths biasing the effect we're interested in estimating.\n:::\n\nCorrectly identifying the causal structure between the exposure and outcome thus helps us 1) communicate the assumptions we're making about the relationships between variables and 2) identify sources of 
bias.\nImportantly, in doing 2), we are also often able to identify ways to prevent bias based on the assumptions in 1).\nIn the simple case of the three DAGs in @fig-dag-path-types, we know whether or not to control for `q` depending on the nature of the causal structure.\nThe set or sets of variables we need to adjust for is called the *adjustment set*.\nDAGs can help us identify adjustment sets even in complex settings [@vanderzander2019].\n\n::: callout-tip\n## What about interaction?\n\nDAGs don't make a statement about interaction or effect estimate modification, even though they are an important part of inference.\nTechnically, interaction is a matter of the functional form of the relationships in the DAG.\nMuch as we don't need to specify how we will model a variable in the DAG (e.g., with splines), we don't need to determine how variables statistically interact.\nThat's a matter for the modeling stage.\n\nThere are several ways we use interactions in causal inference.\nIn one extreme, they are simply a matter of functional form: interaction terms are included in models but marginalized to get an overall causal effect.\nConversely, we're interested in *joint causal effects*, where the two variables interacting are both causal.\nIn between, we can use interaction terms to identify *heterogeneous causal effects*, which vary by a second variable that is not assumed to be causal.\nAs with many tools in causal inference, we use the same statistical technique in many ways to answer different questions.\nWe'll revisit this topic in detail in [Chapter -@sec-interaction].\n\nMany people have tried expressing interaction in DAGs using different types of arcs, nodes, and other annotations, but no approach has taken off as the preferred way [@weinberg2007; @Nilsson2021].\n:::\n\nLet's take a look at an example in R.\nWe'll learn to build DAGs, visualize them, and identify important information like adjustment sets.\n\n## DAGs in R\n\nFirst, consider a research 
question: Does listening to a comedy podcast the morning before an exam improve graduate students' test scores?\nWe can diagram this using the method described in @sec-diag (@fig-diagram-podcast).\n\n\n::: {.cell}\n::: {.cell-output-display}\n![A sentence diagram for the question: Does listening to a comedy podcast the morning before an exam improve graduate student test scores? The population is graduate students. The start time is morning, and the outcome time is after the exam.](../images/podcast-diagram.png){#fig-diagram-podcast width=2267}\n:::\n:::\n\n\nThe tool we'll use for making DAGs is ggdag.\nggdag is a package that connects ggplot2, the most powerful visualization tool in R, to dagitty, an R package with sophisticated algorithms for querying DAGs.\n\nTo create a DAG object, we'll use the `dagify()` function.\n`dagify()` returns a `dagitty` object that works with both the dagitty and ggdag packages.\nThe `dagify()` function takes formulas, separated by commas, that specify causes and effects, with the left element of the formula defining the effect and the right all of the factors that cause it.\nThis is just like the type of formula we specify for most regression models in R.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n effect1 ~ cause1 + cause2 + cause3,\n effect2 ~ cause1 + cause4,\n ...\n)\n```\n:::\n\n\nWhat are all of the factors that cause graduate students to listen to a podcast the morning before an exam?\nWhat are all of the factors that could cause a graduate student to do well on a test?\nLet's posit some here.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(ggdag)\ndagify(\n podcast ~ mood + humor + prepared,\n exam ~ mood + prepared\n)\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\ndag {\nexam\nhumor\nmood\npodcast\nprepared\nhumor -> podcast\nmood -> exam\nmood -> podcast\nprepared -> exam\nprepared -> podcast\n}\n```\n\n\n:::\n:::\n\n\nIn the code above, we assume that:\n\n- a graduate student's mood, sense of humor, and how 
prepared they feel for the exam could influence whether they listened to a podcast the morning of the test\n- their mood and how prepared they are also influence their exam score\n\nNotice we *do not* see podcast in the exam equation; this means that we assume that there is **no** causal relationship between podcast and the exam score.\n\nThere are some other useful arguments you'll often find yourself supplying to `dagify()`:\n\n- `exposure` and `outcome`: Telling ggdag the variables that are the exposure and outcome of your research question is required for many of the most valuable queries we can make of DAGs.\n- `latent`: This argument lets us tell ggdag that some variables in the DAG are unmeasured. `latent` helps identify valid adjustment sets with the data we actually have.\n- `coords`: Coordinates for the variables. You can choose between algorithmic or manual layouts, as discussed below. We'll use `time_ordered_coords()` here.\n- `labels`: A character vector of labels for the variables.\n\nLet's create a DAG object, `podcast_dag`, with some of these attributes, then visualize the DAG with `ggdag()`.\n`ggdag()` returns a ggplot object, so we can add additional layers to the plot, like themes.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag <- dagify(\n podcast ~ mood + humor + prepared,\n exam ~ mood + prepared,\n coords = time_ordered_coords(\n list(\n # time point 1\n c(\"prepared\", \"humor\", \"mood\"), \n # time point 2\n \"podcast\", \n # time point 3\n \"exam\"\n )\n ),\n exposure = \"podcast\",\n outcome = \"exam\",\n labels = c(\n podcast = \"podcast\",\n exam = \"exam score\",\n mood = \"mood\",\n humor = \"humor\",\n prepared = \"prepared\"\n )\n)\nggdag(podcast_dag, use_labels = \"label\", text = FALSE) +\n theme_dag()\n```\n\n::: {.cell-output-display}\n![Proposed DAG to answer the question: Does listening to a comedy podcast the morning before an exam improve graduate students' test 
scores?](chapter-05_files/figure-html/fig-dag-podcast-1.png){#fig-dag-podcast width=384}\n:::\n:::\n\n\n::: callout-note\nFor the rest of the chapter, we'll use `theme_dag()`, a ggplot theme from ggdag meant for DAGs.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntheme_set(\n theme_dag() %+replace%\n # also add some additional styling\n theme(\n legend.position = \"bottom\",\n strip.text.x = element_text(margin = margin(2, 0, 2, 0, \"mm\"))\n )\n)\n```\n:::\n\n:::\n\n::: callout-tip\n## DAG coordinates\n\nYou don't need to specify coordinates to ggdag.\nIf you don't, it uses algorithms designed for automatic layouts.\nThere are many such algorithms, and they focus on different aspects of the layout, e.g., the shape, the space between the nodes, minimizing how many edges cross, etc.\nThese layout algorithms usually have a component of randomness, so it's good to use a seed if you want to get the same result.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\n# no coordinates specified\nset.seed(123)\npod_dag <- dagify(\n podcast ~ mood + humor + prepared,\n exam ~ mood + prepared\n)\n\n# automatically determine layouts\npod_dag |> \n ggdag(text_size = 2.8)\n```\n\n::: {.cell-output-display}\n![](chapter-05_files/figure-html/unnamed-chunk-14-1.png){fig-align='center' width=384}\n:::\n:::\n\n\nWe can also ask for a specific layout, e.g., the popular Sugiyama algorithm for DAGs [@sugiyama1981].\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npod_dag |> \n ggdag(layout = \"sugiyama\", text_size = 2.8)\n```\n\n::: {.cell-output-display}\n![](chapter-05_files/figure-html/unnamed-chunk-15-1.png){fig-align='center' width=384}\n:::\n:::\n\n\nFor causal DAGs, the time-ordered layout algorithm is often best, which we can specify with `time_ordered_coords()` or `layout = \"time_ordered\"`.\nWe'll discuss time ordering in greater detail in @sec-time-ordered.\nEarlier, we explicitly told ggdag which variables were at which time points, but we don't need 
to.\nNotice, though, that the time ordering algorithm puts `podcast` and `exam` at the same time point since one doesn't cause another (and thus predate it).\nWe know that's not the case: listening to the podcast happened before taking the exam.\n\n\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npod_dag |> \n ggdag(layout = \"time_ordered\", text_size = 2.8)\n```\n\n::: {.cell-output-display}\n![](chapter-05_files/figure-html/unnamed-chunk-16-1.png){fig-align='center' width=384}\n:::\n:::\n\n\nYou can manually specify coordinates using a list or data frame and provide them to the `coords` argument of `dagify()`.\nAdditionally, because ggdag is based on dagitty, you can use `dagitty.net` to create and organize a DAG using a graphical interface, then export the result as dagitty code for ggdag to consume.\n\nAlgorithmic layouts are lovely for fast visualization of DAGs or particularly complex graphs.\nOnce you want to share your DAG, it's usually best to be more intentional about the layout, perhaps by specifying the coordinates manually.\n`time_ordered_coords()` is often the best of both worlds, and we'll use it for most DAGs in this book.\n:::\n\nWe've specified the DAG for this question and told ggdag what the exposure and outcome of interest are.\nAccording to the DAG, there is no direct causal relationship between listening to a podcast and exam scores.\nAre there any other open paths?\n`ggdag_paths()` takes a DAG and visualizes the open paths.\nIn @fig-paths-podcast, we see two open paths: `podcast <- mood -> exam` and `podcast <- prepared -> exam`. These are both forks---*confounding pathways*. 
Since there is no causal relationship between listening to a podcast and exam scores, the only open paths are *backdoor* paths, these two confounding pathways.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag |> \n # show the whole dag as a light gray \"shadow\" \n # rather than just the paths\n ggdag_paths(shadow = TRUE, text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![`ggdag_paths()` visualizes open paths in a DAG. There are two open paths in `podcast_dag`: the fork from `mood` and the fork from `prepared`.](chapter-05_files/figure-html/fig-paths-podcast-1.png){#fig-paths-podcast width=672}\n:::\n:::\n\n\n::: callout-tip\n`dagify()` returns a `dagitty()` object, but underneath the hood, ggdag converts `dagitty` objects to tidy DAGs, a structure that holds both the `dagitty` object and a `dataframe` about the DAG.\nThis is handy if you want to manipulate the DAG programmatically.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag_tidy <- podcast_dag |> \n tidy_dagitty()\n\npodcast_dag_tidy\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A DAG with 5 nodes and 5 edges\n#\n# Exposure: podcast\n# Outcome: exam\n#\n# A tibble: 7 × 9\n name x y direction to xend yend\n \n1 exam 3 0 NA NA\n2 humor 1 0 -> podcast 2 0\n3 mood 1 1 -> exam 3 0\n4 mood 1 1 -> podcast 2 0\n5 podcast 2 0 NA NA\n6 prepared 1 -1 -> exam 3 0\n7 prepared 1 -1 -> podcast 2 0\n# ℹ 2 more variables: circular , label \n```\n\n\n:::\n:::\n\n\nMost of the quick plotting functions transform the `dagitty` object to a tidy DAG if it's not already, then manipulate the data in some capacity.\nFor instance, `dag_paths()` underlies `ggdag_paths()`; it returns a tidy DAG with data about the paths.\nYou can use several dplyr functions on these objects directly.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag_tidy |> \n dag_paths() |> \n filter(set == 2, path == \"open path\")\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A DAG with 3 nodes and 2 edges\n#\n# 
Exposure: podcast\n# Outcome: exam\n#\n# A tibble: 4 × 11\n set name x y direction to xend yend\n \n1 2 exam 3 0 NA NA\n2 2 podcast 2 0 NA NA\n3 2 prepar… 1 -1 -> exam 3 0\n4 2 prepar… 1 -1 -> podc… 2 0\n# ℹ 3 more variables: circular , label ,\n# path \n```\n\n\n:::\n:::\n\n\nTidy DAGs are not pure data frames, but you can retrieve either the `dataframe` or `dagitty` object to work with them directly using `pull_dag_data()` or `pull_dag()`.\n`pull_dag()` can be useful when you want to work with dagitty functions:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dagitty)\npodcast_dag_tidy |> \n pull_dag() |> \n paths()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n$paths\n[1] \"podcast <- mood -> exam\" \n[2] \"podcast <- prepared -> exam\"\n\n$open\n[1] TRUE TRUE\n```\n\n\n:::\n:::\n\n:::\n\nBackdoor paths pollute the statistical association between `podcast` and `exam`, so we must account for them.\n`ggdag_adjustment_set()` visualizes any valid adjustment sets implied by the DAG.\n@fig-podcast-adustment-set shows adjusted variables as squares.\nAny arrows coming out of adjusted variables are removed from the DAG because the path is no longer open at that variable.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nggdag_adjustment_set(\n podcast_dag, \n text = FALSE, \n use_labels = \"label\"\n)\n```\n\n::: {.cell-output-display}\n![A visualization of the minimal adjustment set for the podcast-exam DAG. 
If this DAG is correct, two variables are required to block the backdoor paths: `mood` and `prepared`.](chapter-05_files/figure-html/fig-podcast-adustment-set-1.png){#fig-podcast-adustment-set fig-align='center' width=384}\n:::\n:::\n\n\n@fig-podcast-adustment-set shows the *minimal adjustment set*.\nBy default, ggdag returns the set(s) that can close all backdoor paths with the fewest number of variables possible.\nIn this DAG, that's just one set: `mood` and `prepared`.\nThis set makes sense because there are two backdoor paths, and the only other variables on them besides the exposure and outcome are these two variables.\nSo, at minimum, we must account for both to get a valid estimate.\n\n::: callout-tip\n`ggdag()` and friends usually use `tidy_dagitty()` and `dag_*()` or `node_*()` functions to change the underlying data frame.\nSimilarly, the quick plotting functions use ggdag's geoms to visualize the resulting DAG(s).\nIn other words, you can use the same data manipulation and visualization strategies that you use day-to-day directly with ggdag.\n\nHere's a condensed version of what `ggdag_adjustment_set()` is doing:\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\npodcast_dag_tidy |> \n # add adjustment sets to data\n dag_adjustment_sets() |>\n ggplot(aes(\n x = x, y = y, xend = xend, yend = yend,\n color = adjusted, shape = adjusted\n )) + \n # ggdag's custom geoms: add nodes, edges, and labels\n geom_dag_point() + \n # remove adjusted paths\n geom_dag_edges_link(data = \\(.df) filter(.df, adjusted != \"adjusted\")) + \n geom_dag_label_repel() + \n # you can use any ggplot function, too\n facet_wrap(~ set) +\n scale_shape_manual(values = c(adjusted = 15, unadjusted = 19))\n```\n\n::: {.cell-output-display}\n![](chapter-05_files/figure-html/unnamed-chunk-22-1.png){fig-align='center' width=432}\n:::\n:::\n\n:::\n\nMinimal adjustment sets are only one type of valid adjustment set [@vanderzander2019].\nSometimes, other combinations of variables 
can get us an unbiased effect estimate.\nTwo other options available in ggdag are full adjustment sets and canonical adjustment sets.\nFull adjustment sets are every combination of variables that result in a valid set.\n\n\n::: {.cell layout-align=\"center\"}\n\n```{.r .cell-code}\nggdag_adjustment_set(\n podcast_dag, \n text = FALSE, \n use_labels = \"label\",\n # get full adjustment sets\n type = \"all\"\n)\n```\n\n::: {.cell-output-display}\n![All valid adjustment sets for `podcast_dag`.](chapter-05_files/figure-html/fig-adustment-set-all-1.png){#fig-adustment-set-all fig-align='center' width=624}\n:::\n:::\n\n\nIt turns out that we can also control for `humor`.\n\nCanonical adjustment sets are a bit more complex: they are all possible ancestors of the exposure and outcome minus any likely descendants.\nIn fully saturated DAGs (DAGs where every node causes anything that comes after it in time), the canonical adjustment set is the minimal adjustment set.\n\n::: callout-tip\nMost of the functions in ggdag use dagitty underneath the hood.\nIt's often helpful to call dagitty functions directly.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nadjustmentSets(podcast_dag, type = \"canonical\")\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n{ humor, mood, prepared }\n```\n\n\n:::\n:::\n\n:::\n\nUsing our proposed DAG, let's simulate some data to see how accounting for the minimal adjustment set might occur in practice.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nset.seed(10)\nsim_data <- podcast_dag |>\n simulate_data()\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nsim_data\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n# A tibble: 500 × 5\n exam humor mood podcast prepared\n \n 1 -0.435 0.263 -0.100 -0.630 1.07 \n 2 -0.593 0.317 0.143 -1.55 0.0640\n 3 0.786 1.97 -0.591 -0.318 -0.439 \n 4 -0.103 2.86 -0.139 1.07 0.754 \n 5 -0.614 -2.39 0.702 0.464 0.356 \n 6 1.01 1.21 0.910 0.769 0.561 \n 7 0.167 -1.37 -0.559 -0.866 0.214 \n 8 1.16 0.164 -0.743 0.969 -1.67 \n 9 
0.650 0.215 -0.248 0.691 -0.303 \n10 0.156 0.713 1.19 -1.02 -0.219 \n# ℹ 490 more rows\n```\n\n\n:::\n:::\n\n\nSince we have simulated this data, we know that this is a case where *standard methods will succeed* (see @sec-standard) and, therefore, can estimate the causal effect using a basic linear regression model.\n@fig-dag-sim shows a forest plot of the simulated data based on our DAG.\nNotice the model that only included the exposure resulted in a spurious effect (an estimate of -0.1 when we know the truth is 0).\nIn contrast, the model that adjusted for the two variables as suggested by `ggdag_adjustment_set()` is not spurious (much closer to 0).\n\n\n::: {.cell}\n\n```{.r .cell-code}\n## Model that does not close backdoor paths\nunadjusted_model <- lm(exam ~ podcast, sim_data) |>\n broom::tidy(conf.int = TRUE) |>\n dplyr::filter(term == \"podcast\") |>\n mutate(formula = \"podcast\")\n\n## Model that closes backdoor paths\nadjusted_model <- lm(exam ~ podcast + mood + prepared, sim_data) |>\n broom::tidy(conf.int = TRUE) |>\n dplyr::filter(term == \"podcast\") |>\n mutate(formula = \"podcast + mood + prepared\")\n\nbind_rows(\n unadjusted_model,\n adjusted_model\n) |>\n ggplot(aes(x = estimate, y = formula, xmin = conf.low, xmax = conf.high)) +\n geom_vline(xintercept = 0, linewidth = 1, color = \"grey80\") +\n geom_pointrange(fatten = 3, size = 1) +\n theme_minimal(18) +\n labs(\n y = NULL,\n caption = \"correct effect size: 0\"\n )\n```\n\n::: {.cell-output-display}\n![Forest plot of simulated data based on the DAG described in @fig-dag-podcast.](chapter-05_files/figure-html/fig-dag-sim-1.png){#fig-dag-sim width=672}\n:::\n:::\n\n\n## Structures of Causality\n\n### Advanced Confounding\n\nIn `podcast_dag`, `mood` and `prepared` were *direct* confounders: an arrow was going directly from them to `podcast` and `exam`.\nOften, backdoor paths are more complex.\nLet's consider such a case by adding two new variables: `alertness` and `skills_course`.\n`alertness` 
represents the feeling of alertness from a good mood, thus the arrow from `mood` to `alertness`.\n`skills_course` represents whether the student took a College Skills Course and learned time management techniques.\nNow, `skills_course` is what frees up the time to listen to a podcast as well as being prepared for the exam.\n`mood` and `prepared` are no longer direct confounders: they are two variables along a more complex backdoor path.\nAdditionally, we've added an arrow going from `humor` to `mood`.\nLet's take a look at @fig-podcast_dag2.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag2 <- dagify(\n podcast ~ mood + humor + skills_course,\n alertness ~ mood,\n mood ~ humor,\n prepared ~ skills_course,\n exam ~ alertness + prepared,\n coords = time_ordered_coords(),\n exposure = \"podcast\",\n outcome = \"exam\",\n labels = c(\n podcast = \"podcast\",\n exam = \"exam score\",\n mood = \"mood\",\n alertness = \"alertness\",\n skills_course = \"college\\nskills course\",\n humor = \"humor\",\n prepared = \"prepared\"\n )\n)\n\nggdag(podcast_dag2, use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![An expanded version of `podcast_dag` that includes two additional variables: `skills_course`, representing a College Skills Course, and `alertness`.](chapter-05_files/figure-html/fig-podcast_dag2-1.png){#fig-podcast_dag2 width=480}\n:::\n:::\n\n::: {.cell}\n\n:::\n\n\nNow there are *three* backdoor paths we need to close: `podcast <- humor -> mood -> alertness -> exam`, `podcast <- mood -> alertness -> exam`, and `podcast <- skills_course -> prepared -> exam`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggdag_paths(podcast_dag2, use_labels = \"label\", text = FALSE, shadow = TRUE)\n```\n\n::: {.cell-output-display}\n![Three open paths in `podcast_dag2`. 
Since there is no effect of `podcast` on `exam`, all three are backdoor paths that must be closed to get the correct effect.](chapter-05_files/figure-html/fig-podcast_dag2-paths-1.png){#fig-podcast_dag2-paths width=1056}\n:::\n:::\n\n\nThere are four minimal adjustment sets to close all three paths (and eighteen full adjustment sets!).\nThe minimal adjustment sets are `alertness + prepared`, `alertness + skills_course`, `mood + prepared`, `mood + skills_course`.\nWe can now block the open paths in several ways.\n`mood` and `prepared` still work, but we've got other options now.\nNotably, `prepared` and `alertness` could happen at the same time or even after `podcast`.\n`skills_course` and `mood` still happen before both `podcast` and `exam`, so the idea is still the same: the confounding pathway starts before the exposure and outcome.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nggdag_adjustment_set(podcast_dag2, use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![Valid minimal adjustment sets that will close the backdoor paths in @fig-podcast_dag2-paths.](chapter-05_files/figure-html/fig-podcast_dag2-set-1.png){#fig-podcast_dag2-set width=672}\n:::\n:::\n\n\nDeciding between these adjustment sets is a matter of judgment: if all data are perfectly measured, the DAG is correct, and we've modeled them correctly, then it doesn't matter which we use.\nEach adjustment set will result in an unbiased estimate.\nAll three of those assumptions are usually untrue to some degree.\nLet's consider the path via `skills_course` and `prepared`.\nIt may be that we are better able to assess whether or not someone took the College Skills Course than how prepared for the exam they are.\nIn that case, an adjustment set with `skills_course` is a better option.\nBut perhaps we better understand the relationship between preparedness and exam results.\nIf we have it measured, controlling for that might be better.\nWe could get the best of both worlds by including both 
variables: between the better measurement of `skills_course` and the better modeling of `prepared`, we might have a better chance of minimizing confounding from this path.\n\n### Selection Bias and Mediation\n\nSelection bias is another name for the type of bias that is induced by adjusting for a collider [@lu2022].\nIt's called \"selection bias\" because a common form of collider-induced bias is a variable inherently stratified upon by the design of the study---selection *into* the study.\nLet's consider a case based on the original `podcast_dag` but with one additional variable: whether or not the student showed up to the exam.\nNow, there is an indirect effect of `podcast` on `exam`: listening to a podcast influences whether or not the students attend the exam.\nThe true result of `exam` is missing for those who didn't show up; by studying the group of people who *did* show up, we are inherently stratifying on this variable.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag3 <- dagify(\n podcast ~ mood + humor + prepared,\n exam ~ mood + prepared + showed_up,\n showed_up ~ podcast + mood + prepared,\n coords = time_ordered_coords(\n list(\n # time point 1\n c(\"prepared\", \"humor\", \"mood\"), \n # time point 2\n \"podcast\", \n \"showed_up\", \n # time point 3\n \"exam\"\n )\n ),\n exposure = \"podcast\",\n outcome = \"exam\",\n labels = c(\n podcast = \"podcast\",\n exam = \"exam score\",\n mood = \"mood\",\n humor = \"humor\",\n prepared = \"prepared\",\n showed_up = \"showed up\"\n )\n)\nggdag(podcast_dag3, use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![Another variant of `podcast_dag`, this time including the inherent stratification on those who appear for the exam. 
There is still no direct effect of `podcast` on `exam`, but there is an indirect effect via `showed_up`.](chapter-05_files/figure-html/fig-podcast_dag3-1.png){#fig-podcast_dag3 width=432}\n:::\n:::\n\n\nThe problem is that `showed_up` is both a collider and a mediator: stratifying on it induces a relationship between many of the variables in the DAG but blocks the indirect effect of `podcast` on `exam`.\nLuckily, the adjustment sets can handle the first problem; because `showed_up` happens *before* `exam`, we're less at risk of collider bias between the exposure and outcome.\nUnfortunately, we cannot calculate the total effect of `podcast` on `exam` because part of the effect is missing: the indirect effect is closed at `showed_up`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag3 |> \n adjust_for(\"showed_up\") |> \n ggdag_adjustment_set(text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![The adjustment set for `podcast_dag3` given that the data are inherently conditioned on showing up to the exam. In this case, there is no way to recover an unbiased estimate of the total effect of `podcast` on `exam`.](chapter-05_files/figure-html/fig-podcast_dag3-as-1.png){#fig-podcast_dag3-as width=432}\n:::\n:::\n\n\nSometimes, you can still estimate effects in this situation by changing the estimate you wish to calculate.\nWe can't calculate the total effect because we are missing the indirect effect, but we can still calculate the direct effect of `podcast` on `exam`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag3 |> \n adjust_for(\"showed_up\") |> \n ggdag_adjustment_set(effect = \"direct\", text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![The adjustment set for `podcast_dag3` when targeting a different effect. 
There is one minimal adjustment set that we can use to estimate the direct effect of `podcast` on `exam`.](chapter-05_files/figure-html/fig-podcast_dag3-direct-1.png){#fig-podcast_dag3-direct width=432}\n:::\n:::\n\n\n#### M-Bias and Butterfly Bias\n\nA particular case of selection bias that you'll often see people talk about is *M-bias*.\nIt's called M-bias because it looks like an M when arranged top to bottom.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nm_bias() |> \n ggdag()\n```\n\n::: {.cell-output-display}\n![A DAG representing M-Bias, a situation where a collider predates the exposure and outcome.](chapter-05_files/figure-html/fig-m-bias-1.png){#fig-m-bias width=384}\n:::\n:::\n\n\n::: callout-tip\nggdag has several quick-DAGs for demonstrating basic causal structures, including `confounder_triangle()`, `collider_triangle()`, `m_bias()`, and `butterfly_bias()`.\n:::\n\nWhat's theoretically interesting about M-bias is that `m` is a collider but occurs before `x` and `y`.\nRemember that association is blocked at a collider, so there is no open path between `x` and `y`.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npaths(m_bias())\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n$paths\n[1] \"x <- a -> m <- b -> y\"\n\n$open\n[1] FALSE\n```\n\n\n:::\n:::\n\n\nLet's focus on the `mood` path of the podcast-exam DAG.\nWhat if we were wrong about mood, and the actual relationship was M-shaped?\nLet's say that, rather than causing `podcast` and `exam`, `mood` was itself caused by two mutual causes of `podcast` and `exam`, `u1` and `u2`, as in @fig-podcast_dag4.\nWe don't know what `u1` and `u2` are, and we don't have them measured.\nAs above, there are no open paths in this subset of the DAG.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag4 <- dagify(\n podcast ~ u1,\n exam ~ u2,\n mood ~ u1 + u2,\n coords = time_ordered_coords(list(\n c(\"u1\", \"u2\"),\n \"mood\",\n \"podcast\", \n \"exam\"\n )),\n exposure = \"podcast\",\n outcome = \"exam\",\n labels = c(\n 
podcast = \"podcast\",\n exam = \"exam score\",\n mood = \"mood\",\n u1 = \"unmeasured\",\n u2 = \"unmeasured\"\n ),\n # we don't have them measured\n latent = c(\"u1\", \"u2\")\n)\n\nggdag(podcast_dag4, use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![A reconfiguration of @fig-dag-podcast where `mood` is a collider on an M-shaped path.](chapter-05_files/figure-html/fig-podcast_dag4-1.png){#fig-podcast_dag4 width=528}\n:::\n:::\n\n\nThe problem arises when we think our original DAG is the right DAG: `mood` is in the adjustment set, so we control for it.\nBut this induces bias!\nIt opens up a path between `u1` and `u2`, thus creating a path from `podcast` to `exam`.\nIf we had either `u1` or `u2` measured, we could adjust for them to close this path, but we don't.\nThere is no way to close this open path.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag4 |> \n adjust_for(\"mood\") |> \n ggdag_adjustment_set(use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![The adjustment set where `mood` is a collider. 
If we control for `mood` and don't know about or have the unmeasured causes of `mood`, we have no means of closing the backdoor path opened by adjusting for a collider.](chapter-05_files/figure-html/fig-podcast_dag4-as-1.png){#fig-podcast_dag4-as width=528}\n:::\n:::\n\n\nOf course, the best thing to do here is not control for `mood` at all.\nSometimes, though, that is not an option.\nImagine if, instead of `mood`, this turned out to be the real structure for `showed_up`: since we inherently control for `showed_up`, and we don't have the unmeasured variables, our study results will always be biased.\nIt's essential to understand if we're in that situation so we can address it with sensitivity analysis to understand just how biased the effect would be.\n\nLet's consider a variation on M-bias where `mood` causes `podcast` and `exam` and `u1` and `u2` are mutual causes of `mood` and the exposure and outcome.\nThis arrangement is sometimes called butterfly or bowtie bias, again because of its shape.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nbutterfly_bias(x = \"podcast\", y = \"exam\", m = \"mood\", a = \"u1\", b = \"u2\") |> \n ggdag(text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![In butterfly bias, `mood` is both a collider and a confounder. Controlling for the bias induced by `mood` opens a new pathway because we've also conditioned on a collider. 
We can't properly close all backdoor paths without either `u1` or `u2`.](chapter-05_files/figure-html/fig-butterfly_bias-1.png){#fig-butterfly_bias width=480}\n:::\n:::\n\n\nNow, we're in a challenging position: we need to control for `mood` because it's a confounder, but controlling for `mood` opens up the pathway from `u1` to `u2`.\nBecause we don't have either variable measured, we can't then close the path opened from conditioning on `mood`.\nWhat should we do?\nIt turns out that, when in doubt, controlling for `mood` is the better of the two options: confounding bias tends to be worse than collider bias, and M-shaped structures of colliders are sensitive to slight deviations (e.g., if this is not the exact structure, often the bias isn't as bad) [@DingMiratrix2015].\n\nAnother common form of selection bias is from *loss to follow-up*: people drop out of a study in a way that is related to the exposure and outcome.\nWe'll come back to this topic in [Chapter -@sec-longitudinal].\n\n### Causes of the exposure, causes of the outcome\n\nLet's consider one other type of causal structure that's important: causes of the exposure and not the outcome, and their opposites, causes of the outcome and not the exposure.\nLet's add a variable, `grader_mood`, to the original DAG.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npodcast_dag5 <- dagify(\n podcast ~ mood + humor + prepared,\n exam ~ mood + prepared + grader_mood,\n coords = time_ordered_coords(\n list(\n # time point 1\n c(\"prepared\", \"humor\", \"mood\"), \n # time point 2\n c(\"podcast\", \"grader_mood\"), \n # time point 3\n \"exam\"\n )\n ),\n exposure = \"podcast\",\n outcome = \"exam\",\n labels = c(\n podcast = \"podcast\",\n exam = \"exam score\",\n mood = \"student\\nmood\",\n humor = \"humor\",\n prepared = \"prepared\",\n grader_mood = \"grader\\nmood\"\n )\n)\nggdag(podcast_dag5, use_labels = \"label\", text = FALSE)\n```\n\n::: {.cell-output-display}\n![A DAG containing a cause of the exposure that is not 
a cause of the outcome (`humor`) and a cause of the outcome that is not a cause of the exposure (`grader_mood`).](chapter-05_files/figure-html/fig-podcast_dag5-1.png){#fig-podcast_dag5 width=480}\n:::\n:::\n\n\nThere are now two variables that aren't related to *both* the exposure and the outcome: `humor`, which causes `podcast` but not `exam`, and `grader_mood`, which causes `exam` but not `podcast`.\nLet's start with `humor`.\n\nVariables that cause the exposure but not the outcome are also called *instrumental variables* (IVs).\nIVs are an unusual circumstance where, under certain conditions, controlling for them can make other types of bias worse.\nWhat's unique about this is that IVs can *also* be used to conduct an entirely different approach to estimating an unbiased effect of the exposure on the outcome.\nIVs are commonly used this way in econometrics and are increasingly popular in other areas.\nIn short, IV analysis allows us to estimate the causal effect using a different set of assumptions than the approaches we've talked about thus far.\nSometimes, a problem intractable using propensity score methods can be addressed using IVs and vice versa.\nWe'll talk more about IVs in @sec-iv-friends.\n\nSo, if you're *not* using IV methods, should you include an IV in a model meant to address confounding?\nIf you're unsure if the variable is an IV or not, you should probably add it to your model: it's more likely to be a confounder than an IV, and, it turns out, the bias from adding an IV is usually small in practice.\nSo, like adjusting for a potential M-structure variable, the risk of bias is worse from confounding [@Myers2011].\n\nNow, let's talk about the opposite of an IV: a cause of the outcome that is not a cause of the exposure.\nThese variables are sometimes called *competing exposures* (because they also cause the outcome) or *precision variables* (because, as we'll see, they increase the precision of causal estimates).\nWe'll call them precision 
variables because we're concerned about the relationship to the research question at hand, not to another research question where they are exposures [@Brookhart2006].\n\nLike IVs, precision variables do not occur along paths from the exposure to the outcome.\nThus, including them is not necessary.\nUnlike IVs, including precision variables is beneficial.\nIncluding other causes of the outcomes helps a statistical model capture some of its variation.\nThis doesn't impact the point estimate of the effect, but it does reduce the variance, resulting in smaller standard errors and narrower confidence intervals.\nThus, we recommend including them when possible.\n\nSo, even though we don't need to control for `grader_mood`, if we have it in the data set, we should.\nSimilarly, `humor` is not a good addition to the model unless we think it really might be a confounder; if it is a valid instrument, we might want to consider using IV methods to estimate the effect instead.\n\n### Measurement Error and Missingness\n\nDAGs can also help us understand the bias arising from mismeasurements in the data, including the worst mismeasurement: not measuring it at all.\nWe'll cover these topics in [Chapter -@sec-missingness], but the basic idea is that by separating the actual value from the observed value, we can better understand how such biases may behave [@Hernán2009].\nHere's a basic example of a bias called *recall bias*.\nRecall bias is when the outcome influences a participant's memory of exposure, so it's a particular problem in retrospective studies where the earlier exposure is not recorded until after the outcome happens.\nAn example of when this can occur is a case-control study of cancer.\nSomeone *with* cancer may be more motivated to ruminate on their past exposures than someone *without* cancer.\nSo, their memory about a given exposure may be more refined than someone without.\nBy conditioning on the observed version of the exposure, we open up many collider 
paths.\nUnfortunately, there is no way to close them all.\nIf this is the case, we must investigate how severe the bias would be in practice.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nerror_dag <- dagify(\n exposure_observed ~ exposure_real + exposure_error,\n outcome_observed ~ outcome_real + outcome_error,\n outcome_real ~ exposure_real,\n exposure_error ~ outcome_real,\n labels = c(\n exposure_real = \"Exposure\\n(truth)\",\n exposure_error = \"Measurement Error\\n(exposure)\",\n exposure_observed = \"Exposure\\n(observed)\",\n outcome_real = \"Outcome\\n(truth)\",\n outcome_error = \"Measurement Error\\n(outcome)\",\n outcome_observed = \"Outcome\\n(observed)\"\n ),\n exposure = \"exposure_real\",\n outcome = \"outcome_real\",\n coords = time_ordered_coords()\n)\n\nerror_dag |> \n ggdag(text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![A DAG representing measurement error in observing the exposure and outcome. In this case, the outcome impacts the participant's memory of the exposure, also known as recall bias.](chapter-05_files/figure-html/fig-error_dag-1.png){#fig-error_dag width=528}\n:::\n:::\n\n\n## Recommendations in building DAGs\n\nIn principle, using DAGs is easy: specify the causal relationships you think exist and then query the DAG for information like valid adjustment sets.\nIn practice, assembling DAGs takes considerable time and thought.\nNext to defining the research question itself, it's one of the most challenging steps in making causal inferences.\nVery little guidance exists on best practices in assembling DAGs.\n@Tennant2021 collected data on DAGs in applied health research to better understand how researchers used them.\n@tbl-dag-properties shows some information they collected: the median number of nodes and arcs in a DAG, their ratio, the saturation percent of the DAG, and how many were fully saturated.\nSaturating DAGs means adding all possible arrows going forward in time, e.g., in a fully saturated DAG, any 
given variable at time point 1 has arrows going to all variables in future time points, and so on.\nMost DAGs were only about half saturated, and very few were fully saturated.\n\nOnly about half of the papers using DAGs reported the adjustment set used.\nIn other words, researchers presented their assumptions about the research question but not the implications about how they should handle the modeling stage or if they did use a valid adjustment set.\nSimilarly, the majority of studies did not report the estimand of interest.\n\n::: callout-note\nThe estimand is the target of interest in terms of what we're trying to estimate, as discussed briefly in [Chapter -@sec-whole-game].\nWe'll discuss estimands in detail in [Chapter -@sec-estimands].\n:::\n\n\n::: {#tbl-dag-properties .cell tbl-cap='A table of DAG properties measured by @Tennant2021. Number of nodes and arcs are the median number of variables and arrows in the analyzed DAGs, while the Node to Arc ratio is their ratio. Saturation proportion is the proportion of all possible arrows going forward in time to other included variables. Fully saturated DAGs are those that include all such arrows. @Tennant2021 also analyzed whether studies reported their estimands and adjustment sets.'}\n::: {.cell-output-display}\n\n```{=html}\n
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n \n \n \n\n \n\n \n\n \n\n \n\n \n\n \n \n \n \n \n \n \n
CharacteristicN = 1441
DAG properties
Number of Nodes12 (9, 16)
Number of Arcs29 (19, 41)
Node to Arc Ratio2.30 (1.78, 3.00)
Saturation Proportion0.46 (0.31, 0.67)
Fully Saturated
    Yes4 (3%)
    No140 (97%)
Reporting
Reported Estimand
    Yes40 (28%)
    No104 (72%)
Reported Adjustment Set
    Yes80 (56%)
    No64 (44%)
1 Median (IQR); n (%)
\n
\n```\n\n:::\n:::\n\n\nIn this section, we'll offer some advice from @Tennant2021 and our own experience assembling DAGs.\n\n### Iterate early and often\n\nOne of the best things you can do for the quality of your results is to make the DAG before you conduct the study, ideally before you even collect the data.\nIf you're already working with your data, at minimum, build your DAG before doing data analysis.\nThis advice is similar in spirit to pre-registered analysis plans: declaring your assumptions ahead of time can help clarify what you need to do, reduce the risk of overfitting (e.g., determining confounders incorrectly from the data), and give you time to get feedback on your DAG.\n\nThis last benefit is significant: you should ideally democratize your DAG.\nShare it early and often with others who are experts on the data, domain, and models.\nIt's natural to create a DAG, present it to your colleagues, and realize you have missed something important.\nSometimes, you will only agree on some details of the structure.\nThat's a good thing: you know now where there is uncertainty in your DAG.\nYou can then examine the results from multiple plausible DAGs or address the uncertainty with sensitivity analyses.\n\nIf you have more than one candidate DAG, check their adjustment sets.\nIf two DAGs have overlapping adjustment sets, focus on those sets; then, you can move forward in a way that satisfies the plausible assumptions you have.\n\n### Consider your question\n\nAs we saw in @fig-podcast_dag3, some questions can be challenging to answer with certain data, while others are more approachable.\nYou should consider precisely what it is you want to estimate.\nDefining your target estimate is an important topic and the subject of [Chapter -@sec-estimands].\n\nAnother important detail about how your DAG relates to your question is the population and time.\nMany causal structures are not static over time and space.\nConsider lung cancer: the distribution of causes of 
lung cancer was considerably different before the spread of smoking.\nIn medieval Japan, before the spread of tobacco from the Americas centuries later, the causal structure for lung cancer would have been practically different from what it is in Japan today, both in terms of tobacco use and other factors (age of the population, etc.).\n\nThe same is true for confounders.\nEven if something *can* cause the exposure and outcome, if the prevalence of that thing is zero in the population you're analyzing, it's irrelevant to the causal question.\nIt may also be that, in some populations, it doesn't affect one of the two.\nThe reverse is also true: something might be unique to the target population.\nThe use of tobacco in North America several centuries ago was unique among the world population, even though ceremonial tobacco use was quite different from modern recreational use.\nMany changes won't happen as dramatically as across centuries, but sometimes, they do, e.g., if regulation in one country effectively eliminates the population's exposure to something.\n\n### Order nodes by time {#chapter-05-chapter-05-chapter-05-sec-time-ordered}\n\nAs discussed earlier, we recommend ordering your variables by time, either left-to-right or up-to-down.\nThere are two reasons for this.\nFirst, time ordering is an integral part of your assumptions.\nAfter all, something happening before another thing is a requirement for it to be a cause.\nThinking this through carefully will clarify your DAG and the variables you need to address.\n\nSecond, after a certain level of complexity, it's easier to read a DAG when arranged by time because you have to think less about that dimension; it's inherent to the layout.\nThe time ordering algorithm in ggdag automates much of this for you, although, as we saw earlier, it's sometimes helpful to give it more information about the order.\n\nA related topic is feedback loops [@murray2022].\nOften, we think about two things that mutually cause each 
other as happening in a circle, like global warming and A/C use (A/C use increases global warming, which makes it hotter, which increases A/C use, and so on).\nIt's tempting to visualize that relationship like this:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n ac_use ~ global_temp,\n global_temp ~ ac_use,\n labels = c(ac_use = \"A/C use\", global_temp = \"Global\\ntemperature\")\n) |> \n ggdag(layout = \"circle\", edge_type = \"arc\", text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![A DAG representing the reciprocal relationship between A/C use and global temperature because of global warming. Feedback loops are useful mental shorthands to describe variables that impact each other over time compactly, but they are not true causal diagrams.](chapter-05_files/figure-html/fig-feedback-loop-1.png){#fig-feedback-loop width=432}\n:::\n:::\n\n\nFrom a DAG perspective, this is a problem because of the *A* part of *DAG*: it's cyclic!\nImportantly, though, it's also not correct from a causal perspective.\nFeedback loops are a shorthand for what really happens, which is that the two variables mutually affect each other *over time*.\nCausality only goes forward in time, so it doesn't make sense to go back and forth like in @fig-feedback-loop.\n\nThe real DAG looks something like this:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n global_temp_2000 ~ ac_use_1990 + global_temp_1990,\n ac_use_2000 ~ ac_use_1990 + global_temp_1990,\n global_temp_2010 ~ ac_use_2000 + global_temp_2000,\n ac_use_2010 ~ ac_use_2000 + global_temp_2000,\n global_temp_2020 ~ ac_use_2010 + global_temp_2010,\n ac_use_2020 ~ ac_use_2010 + global_temp_2010,\n coords = time_ordered_coords(),\n labels = c(\n ac_use_1990 = \"A/C use\\n(1990)\", \n global_temp_1990 = \"Global\\ntemperature\\n(1990)\",\n ac_use_2000 = \"A/C use\\n(2000)\", \n global_temp_2000 = \"Global\\ntemperature\\n(2000)\",\n ac_use_2010 = \"A/C use\\n(2010)\", \n global_temp_2010 = 
\"Global\\ntemperature\\n(2010)\",\n ac_use_2020 = \"A/C use\\n(2020)\", \n global_temp_2020 = \"Global\\ntemperature\\n(2020)\"\n )\n) |> \n ggdag(text = FALSE, use_labels = \"label\")\n```\n\n::: {.cell-output-display}\n![A DAG showing the relationship between A/C use and global temperature over time. The true causal relationship in a feedback loop goes *forward*.](chapter-05_files/figure-html/fig-feedforward-1.png){#fig-feedforward width=480}\n:::\n:::\n\n\nThe two variables, rather than being in a feed*back* loop, are actually in a feed*forward* loop: they co-evolve over time.\nHere, we only show four discrete moments in time (the decades from 1990 to 2020), but of course, we could get much finer depending on the question and data.\n\nAs with any DAG, the proper analysis approach depends on the question.\nThe effect of A/C use in 2000 on the global temperature in 2020 produces a different adjustment set than the global temperature in 2000 on A/C use in 2020.\nSimilarly, whether we also model this change over time or just those two time points depends on the question.\nOften, these feedforward relationships require you to address *time-varying* confounding, which we'll discuss in [Chapter -@sec-longitudinal].\n\n### Consider the whole data collection process\n\nAs @fig-podcast_dag3 showed us, it's essential to consider the *way* we collected data as much as the causal structure of the question.\nConsidering the whole data collection process is particularly true if you're working with \"found\" data---a data set not intentionally collected to answer the research question.\nWe are always inherently conditioning on the data we have vs. 
the data we don't have.\nIf other variables influenced the data collection process in the causal structure, you need to consider the impact.\nDo you need to control for additional variables?\nDo you need to change the effect you are trying to estimate?\nCan you answer the question at all?\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n::: callout-tip\n## What about case-control studies?\n\nA standard study design in epidemiology is the case-control study.\nCase-control studies are beneficial when the outcome under study is rare or takes a very long time to happen (like many types of cancer).\nParticipants are selected into the study based on their outcome: once a person has an event, they are entered as a case and matched with a control who hasn't had the event.\nOften, they are matched on other factors as well.\n\nMatched case-control studies are selection biased by design [@mansournia2013].\nIn @fig-case-control, when we condition on selection into the study, we lose the ability to close all backdoor paths, even if we control for `confounder`.\nFrom the DAG, it would appear that the entire design is invalid!\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n outcome ~ confounder + exposure,\n selection ~ outcome + confounder,\n exposure ~ confounder,\n exposure = \"exposure\",\n outcome = \"outcome\",\n coords = time_ordered_coords()\n) |> \n ggdag(edge_type = \"arc\", text_size = 2.2)\n```\n\n::: {.cell-output-display}\n![A DAG representing a matched case-control study. In such a study, selection is determined by outcome status and any matched confounders. Selection into the study is thus a collider. 
Since it is inherently stratified on who is actually in the study, such data are limited in the types of causal effects they can estimate.](chapter-05_files/figure-html/fig-case-control-1.png){#fig-case-control width=432}\n:::\n:::\n\n\nLuckily, this isn't wholly true.\nCase-control studies are limited in the type of causal effects they can estimate (causal odds ratios, which under some circumstances approximate causal risk ratios).\nWith careful study design and sampling, the math works out such that these estimates are still valid.\nExactly how and why case-control studies work is beyond the scope of this book, but they are a remarkably clever design.\n:::\n\n### Include variables you don't have\n\nIt's critical that you include *all* variables important to the causal structure, not just the variables you have measured in your data.\nggdag can mark variables as unmeasured (\"latent\"); it will then return only usable adjustment sets, e.g., those without the unmeasured variables.\nOf course, the best thing to do is to use DAGs to help you understand what to measure in the first place, but there are many reasons why your data might be different.\nEven data intentionally collected for the research question might not have a variable discovered to be a confounder after data collection.\n\nFor instance, if we have a DAG where `exposure` and `outcome` have a confounding pathway consisting of `confounder1` and `confounder2`, we can control for either to successfully debias the estimate:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n outcome ~ exposure + confounder1,\n exposure ~ confounder2,\n confounder2 ~ confounder1,\n exposure = \"exposure\",\n outcome = \"outcome\"\n) |> \n adjustmentSets()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n{ confounder1 }\n{ confounder2 }\n```\n\n\n:::\n:::\n\n\nThus, if just one is missing (`latent`), then we are ok:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n outcome ~ exposure + confounder1,\n exposure ~ 
confounder2,\n confounder2 ~ confounder1,\n exposure = \"exposure\",\n outcome = \"outcome\",\n latent = \"confounder1\"\n) |> \n adjustmentSets()\n```\n\n::: {.cell-output .cell-output-stdout}\n\n```\n{ confounder2 }\n```\n\n\n:::\n:::\n\n\nBut if both are missing, there are no valid adjustment sets.\n\nWhen you don't have a variable measured, you still have a few options.\nAs mentioned above, you may be able to identify alternate adjustment sets.\nIf the missing variable is required to close all backdoor paths completely, you can and should conduct a sensitivity analysis to understand the impact of not having it.\nThis is the subject of [Chapter -@sec-sensitivity].\n\nUnder some lucky circumstances, you can also use a *proxy* confounder [@miao2018].\nA proxy confounder is a variable closely related to the confounder such that controlling for it controls for some of the effects of the missing variable.\nConsider an expansion of the fundamental confounding relationship where `q` has a cause, `p`, as in @fig-proxy-confounder.\nTechnically, if we don't have `q`, we can't close the backdoor path, and our effect will be biased.\nPractically, though, if `p` is highly correlated with `q`, it can serve as a method to reduce the confounding from `q`.\nYou can think of `p` as a mismeasured version of `q`; it will seldom wholly control for the bias via `q`, but it can help minimize it.\n\n\n::: {.cell}\n\n```{.r .cell-code}\ndagify(\n y ~ x + q,\n x ~ q,\n q ~ p,\n coords = time_ordered_coords()\n) |> \n ggdag(edge_type = \"arc\")\n```\n\n::: {.cell-output-display}\n![A DAG with a confounder, `q`, and a proxy confounder, `p`. The true adjustment set is `q`. 
Since `p` causes `q`, it contains information about `q` and can reduce the bias if we don't have `q` measured.](chapter-05_files/figure-html/fig-proxy-confounder-1.png){#fig-proxy-confounder width=432}\n:::\n:::\n\n\n### Saturate your DAG, then prune\n\nIn discussing @tbl-dag-properties, we mentioned *saturated* DAGs.\nThese are DAGs where all possible arrows are included based on the time ordering, e.g., every variable causes variables that come after it in time.\n\n*Not* including an arrow is a bigger assumption than including one.\nIn other words, your default should be to have an arrow from one variable to a future variable.\nThis default is counterintuitive to many people.\nHow can it be that we need to be so careful about assessing causal effects yet be so liberal in applying causal assumptions in the DAG?\nThe answer to this lies in the strength and prevalence of the cause.\nTechnically, an arrow present means that *for at least a single observation*, the prior node causes the following node.\nThe arrow similarly says nothing about the strength of the relationship.\nSo, a minuscule causal effect on a single individual justifies the presence of an arrow.\nIn practice, such a case is probably not relevant.\nThere is *effectively* no arrow.\n\nThe more significant point, though, is that you should feel confident to add an arrow.\nThe bar for justification is much lower than you think.\nInstead, it's helpful to 1) determine your time ordering, 2) saturate the DAG, and 3) prune out implausible arrows.\n\nLet's experiment by working through a saturated version of the podcast-exam DAG.\n\nFirst, the time-ordering.\nPresumably, the student's sense of humor far predates the day of the exam.\nMood in the morning, too, predates listening to the podcast or exam score, as does preparation.\nThe saturated DAG given this ordering is:\n\n\n::: {.cell}\n::: {.cell-output-display}\n![A saturated version of `podcast_dag`: variables have all possible arrows going forward to 
other variables over time.](chapter-05_files/figure-html/fig-podcast_dag_sat-1.png){#fig-podcast_dag_sat width=528}\n:::\n:::\n\n\nThere are a few new arrows here.\nHumor now causes the other two confounders, as well as exam score.\nSome of them make sense.\nSense of humor probably affects mood for some people.\nWhat about preparedness?\nThis relationship seems a little less plausible.\nSimilarly, we know that a sense of humor does not affect exam scores in this case because the grading is blinded.\nLet's prune those two.\n\n\n::: {.cell}\n::: {.cell-output-display}\n![A pruned version of @fig-podcast_dag_sat: we've removed implausible arrows from the fully saturated DAGs.](chapter-05_files/figure-html/fig-podcast_dag_pruned-1.png){#fig-podcast_dag_pruned width=528}\n:::\n:::\n\n\nThis DAG seems more reasonable.\nSo, was our original DAG wrong?\nThat depends on several factors.\nNotably, both DAGs produce the same adjustment set: controlling for `mood` and `prepared` will give us an unbiased effect if either DAG is correct.\nEven if the new DAG were to produce a different adjustment set, whether the result is meaningfully different depends on the strength of the confounding.\n\n### Include instruments and precision variables\n\nTechnically, you do not need to include instrumental and precision variables in your DAG.\nThe adjustment sets will be the same with and without them.\nHowever, adding them is helpful for two reasons.\nFirstly, they demonstrate your assumptions about their relationships and the variables under study.\nAs discussed above, *not* including an arrow is a more significant assumption than including one, so it's valuable information about how you think the causal structure operates.\nSecondly, it impacts your modeling decision.\nYou should always include precision variables in your model to reduce variability in your estimate so it helps you identify those.\nInstruments are also helpful to see because they may guide alternative or complementary 
modeling strategies, as we'll discuss in @sec-evidence.\n\n### Focus on the causal structure, then consider measurement bias\n\nAs we saw above, missingness and measurement error can be a source of bias.\nAs we'll see in [Chapter -@sec-missingness], we have several strategies to approach such a situation.\nYet, almost everything we measure is inaccurate to some degree.\nThe true DAG for the data at hand inherently conditions on the measured version of variables.\nIn that sense, your data are always subtly-wrong, a sort of unreliable narrator.\nWhen should we include this information in the DAG?\nWe recommend first focusing on the causal structure of the DAG as if you had perfectly measured each variable [@hernan2021].\nThen, consider how mismeasurement and missingness might affect the realized data, particularly related to the exposure, outcome, and critical confounders.\nYou may prefer to present this as an alternative DAG to consider strategies for addressing the bias arising from those sources, e.g., imputation or sensitivity analyses.\nAfter all, the DAG in \\@ fig-error_dag makes you think the question is unanswerable because we have no method to close all backdoor paths.\nAs with all open paths, that depends on the severity of the bias and our ability to reckon with it.\n\n\n\n\n\n### Pick adjustment sets most likely to be successful\n\nOne area where measurement error is an important consideration is when picking an adjustment set.\nIn theory, if a DAG is correct, any adjustment set will work to create an unbiased result.\nIn practice, variables have different levels of quality.\nPick an adjustment set most likely to succeed because it contains accurate variables.\nSimilarly, non-minimal adjustment sets are helpful to consider because, together, several variables with measurement error along a backdoor path may be enough to minimize the practical bias resulting from that path.\n\nWhat if you don't have certain critical variables measured and thus do not have 
a valid adjustment set?\nIn that case, you should pick the adjustment set with the best chance of minimizing the bias from other backdoor paths.\nAll is not lost if you don't have every confounder measured: get the highest quality estimate you can, then conduct a sensitivity analysis about the unmeasured variables to understand the impact.\n\n### Use robustness checks\n\nFinally, we recommend checking your DAG for robustness.\nYou can never verify the correctness of your DAG under most conditions, but you can use the implications in your DAG to support it.\nThree types of robustness checks can be helpful depending on the circumstances.\n\n1. **Negative controls** [@Lipsitch2010]. These come in two flavors: negative exposure controls and negative outcome controls. The idea is to find something associated with one but not the other, e.g., the outcome but not the exposure, so there should be no effect. Since there should be no effect, you now have a measurement for how well you control for *other* effects (e.g., the difference from null). Ideally, the confounders for negative controls are similar to the research question.\n2. **DAG-data consistency** [@Textor2016]. Negative controls are an implication of your DAG. An extension of this idea is that there are *many* such implications. Because blocking a path removes statistical dependencies from that path, you can check those assumptions in several places in your DAG.\n3. **Alternate adjustment sets**. Adjustment sets should give roughly the same answer because, outside of random and measurement errors, they are all sets that block backdoor paths. 
If more than one adjustment set seems reasonable, you can use that as a sensitivity analysis by checking multiple models.\n\nWe'll discuss these in detail in [Chapter -@sec-sensitivity].\nThe caveat here is that these should be complementary to your initial DAG, not a way of *replacing* it.\nIn fact, if you use more than one adjustment set during your analysis, you should report the results from all of them to avoid overfitting your results to your data.\n", "supporting": [ "chapter-05_files" ], diff --git a/_freeze/chapters/chapter-16/execute-results/html.json b/_freeze/chapters/chapter-16/execute-results/html.json index af2d264..1123de4 100644 --- a/_freeze/chapters/chapter-16/execute-results/html.json +++ b/_freeze/chapters/chapter-16/execute-results/html.json @@ -1,7 +1,7 @@ { - "hash": "8797db807aa4f4182c67492d4f35ea09", + "hash": "05f0f21481f04bb0be23917f8f78e338", "result": { - "markdown": "# Interaction {#sec-interaction}\n\n\n\n\n\n## Functional form, hetereogenous effects, and joint causes\n\n## Fitting interaction terms in causal models", + "markdown": "# Interaction {#sec-interaction}\n\n\n\n\n\n## Functional form, hetereogenous effects, and joint causes\n\n## Fitting interaction terms in causal models\n\n\n", "supporting": [], "filters": [ "rmarkdown/pagebreak.lua" diff --git a/chapters/chapter-05.qmd b/chapters/chapter-05.qmd index dc29bf1..15bcf15 100644 --- a/chapters/chapter-05.qmd +++ b/chapters/chapter-05.qmd @@ -62,7 +62,7 @@ The type of causal diagrams we use are also called directed acyclic graphs (DAGs These graphs are directed because they include arrows going in a specific direction. They're acyclic because they don't go in circles; a variable can't cause itself, for instance. DAGs are used for various problems, but we're specifically concerned with *causal* DAGs. -This class of DAGs is sometimes called Structural Causal Models (SCMs) because they are a model of the causal structure of a question. 
+This class of DAGs is sometimes called Structural Causal Models (SCMs) because they are a model of the causal structure of a question [@hernan2021; @Pearl_Glymour_Jewell_2021]. [^1]: An essential but rarely observed detail of DAGs is that dag is also an [affectionate Australian insult](https://en.wikipedia.org/wiki/Dag_(slang)) referring to the dung-caked fur of a sheep, a *daglock*. @@ -340,8 +340,8 @@ Since `x` and `y` happen before `q`, `q` can't impact them. Let's turn the DAG on its side and consider @fig-collider-time. If we break down the two time points, at time point 1, `q` hasn't happened yet, and `x` and `y` are unrelated. At time point 2, `q` happens due to `x` and `y`. -But causality only goes forward in time. -`q` happening later can't change the fact that `x` and `y` happened independently at time point 1. +*But causality only goes forward in time*. +`q` happening later can't change the fact that `x` and `y` happened independently in the past. ```{r} #| echo: false @@ -389,14 +389,14 @@ Consider a case where `x` and `y` are the only causes of `q`, and all three vari When *either* `x` or `y` equals 1, then `q` happens. If we know `q = 1` and `x = 0` then logically it must be that `y = 1`. Thus, knowing about `q` gives us information about `y` via `x`. -This example is extreme, but it shows how this type of bias, sometimes called *collider-stratification bias* or *selection bias*, occurs: conditioning on `q` provides statistical information about `x` and `y` and distorts their relationship. +This example is extreme, but it shows how this type of bias, sometimes called *collider-stratification bias* or *selection bias*, occurs: conditioning on `q` provides statistical information about `x` and `y` and distorts their relationship [@Banack2023]. ::: callout-tip ## Exchangeability revisited We commonly refer to exchangability as the assumption of no confounding. Actually, this isn't quite right. -It's the assumption of no *open, non-causal* paths. 
+It's the assumption of no *open, non-causal* paths [@hernan2021]. Many times, these are confounding pathways. However, conditioning on a collider can also open paths. Even though these aren't confounders, doing so creates non-exchangeability between the two groups: they are different in a way that matters to the exposure and outcome. @@ -409,7 +409,7 @@ Correctly identifying the causal structure between the exposure and outcome thus Importantly, in doing 2), we are also often able to identify ways to prevent bias based on the assumptions in 1). In the simple case of the three DAGs in @fig-dag-path-types, we know whether or not to control for `q` depending on the nature of the causal structure. The set or sets of variables we need to adjust for is called the *adjustment set*. -DAGs can help us identify adjustment sets even in complex settings. +DAGs can help us identify adjustment sets even in complex settings [@vanderzander2019]. ::: callout-tip ## What about interaction? @@ -423,10 +423,10 @@ There are several ways we use interactions in causal inference. In one extreme, they are simply a matter of functional form: interaction terms are included in models but marginalized to get an overall causal effect. Conversely, we're interested in *joint causal effects*, where the two variables interacting are both causal. In between, we can use interaction terms to identify *heterogeneous causal effects*, which vary by a second variable that is not assumed to be causal. -As with many tools in causal inference, we use the same statistical technique in many ways to answer different questions. +As with many tools in causal inference, we use the same statistical technique in many ways to answer different questions. We'll revisit this topic in detail in [Chapter -@sec-interaction]. -Many people have tried expressing interaction in DAGs using different types of arcs, nodes, and other annotations, but no approach has taken off as the preferred way. 
+Many people have tried expressing interaction in DAGs using different types of arcs, nodes, and other annotations, but no approach has taken off as the preferred way [@weinberg2007; @Nilsson2021]. ::: Let's take a look at an example in R. @@ -561,7 +561,7 @@ pod_dag |> ggdag(text_size = 2.8) ``` -We can also ask for a specific layout, e.g., the popular Sugiyama algorithm for DAGs. +We can also ask for a specific layout, e.g., the popular Sugiyama algorithm for DAGs [@sugiyama1981]. ```{r} #| fig-width: 4 @@ -695,7 +695,7 @@ podcast_dag_tidy |> ``` ::: -Minimal adjustment sets are only one type of valid adjustment set. +Minimal adjustment sets are only one type of valid adjustment set [@vanderzander2019]. Sometimes, other combinations of variables can get us an unbiased effect estimate. Two other options available in ggdag are full adjustment sets and canonical adjustment sets. Full adjustment sets are every combination of variables that result in a valid set. @@ -862,7 +862,7 @@ We could get the best of both worlds by including both variables: between the be ### Selection Bias and Mediation -Selection bias is another name for the type of bias that is induced by adjusting for a collider. +Selection bias is another name for the type of bias that is induced by adjusting for a collider [@lu2022]. It's called "selection bias" because a common form of collider-induced bias is a variable inherently stratified upon by the design of the study---selection *into* the study. Let's consider a case based on the original `podcast_dag` but with one additional variable: whether or not the student showed up to the exam. Now, there is an indirect effect of `podcast` on `exam`: listening to a podcast influences whether or not the students attend the exam. @@ -949,7 +949,8 @@ m_bias() |> ggdag has several quick-DAGs for demonstrating basic causal structures, including `confounder_triangle()`, `collider_triangle()`, `m_bias()`, and `butterfly_bias()`. 
::: -What's theoretically interesting about M-bias is that `m' is a collider but occurs before`x`and`y`. Remember that association is blocked at a collider, so there is no open path between`x`and`y\`. +What's theoretically interesting about M-bias is that `m` is a collider but occurs before `x` and `y`. +Remember that association is blocked at a collider, so there is no open path between `x` and `y`. ```{r} paths(m_bias()) @@ -1029,7 +1030,7 @@ butterfly_bias(x = "podcast", y = "exam", m = "mood", a = "u1", b = "u2") |> Now, we're in a challenging position: we need to control for `mood` because it's a confounder, but controlling for `mood` opens up the pathway from `u1` to `u2`. Because we don't have either variable measured, we can't then close the path opened from conditioning on `mood`. What should we do? -It turns out that, when in doubt, controlling for `mood` is the better of the two options: confounding bias tends to be worse than collider bias, and M-shaped structures of colliders are sensitive to slight deviations (e.g., if this is not the exact structure, often the bias isn't as bad). +It turns out that, when in doubt, controlling for `mood` is the better of the two options: confounding bias tends to be worse than collider bias, and M-shaped structures of colliders are sensitive to slight deviations (e.g., if this is not the exact structure, often the bias isn't as bad) [@DingMiratrix2015]. Another common form of selection bias is from *loss to follow-up*: people drop out of a study in a way that is related to the exposure and outcome. We'll come back to this topic in [Chapter -@sec-longitudinal]. @@ -1084,11 +1085,11 @@ We'll talk more about IVs in @sec-iv-friends. So, if you're *not* using IV methods, should you include an IV in a model meant to address confounding? 
If you're unsure if the variable is an IV or not, you should probably add it to your model: it's more likely to be a confounder than an IV, and, it turns out, the bias from adding an IV is usually small in practice. -So, like adjusting for a potential M-structure variable, the risk of bias is worse from confounding. +So, like adjusting for a potential M-structure variable, the risk of bias is worse from confounding [@Myers2011]. Now, let's talk about the opposite of an IV: a cause of the outcome that is not the cause of the exposure. These variables are sometimes called *competing exposures* (because they also cause the outcome) or *precision variables* (because, as we'll see, they increase the precision of causal estimates). -We'll call them precision variables because we're concerned about the relationship to the research question at hand, not to another research question where they are exposures. +We'll call them precision variables because we're concerned about the relationship to the research question at hand, not to another research question where they are exposures [@Brookhart2006]. Like IVs, precision variables do not occur along paths from the exposure to the outcome. Thus, including them is not necessary. @@ -1103,7 +1104,7 @@ Similarly, `humor` is not a good addition to the model unless we think it really ### Measurement Error and Missingness DAGs can also help us understand the bias arising from mismeasurements in the data, including the worst mismeasurement: not measuring it at all. -We'll cover these topics in [Chapter -@sec-missingness], but the basic idea is that by separating the actual value from the observed value, we can better understand how such biases may behave. +We'll cover these topics in [Chapter -@sec-missingness], but the basic idea is that by separating the actual value from the observed value, we can better understand how such biases may behave [@Hernán2009]. Here's a basic example of a bias called *recall bias*. 
Recall bias is when the outcome influences a participant's memory of exposure, so it's a particular problem in retrospective studies where the earlier exposure is not recorded until after the outcome happens. An example of when this can occur is a case-control study of cancer. @@ -1254,7 +1255,7 @@ Thinking this through carefully will clarify your DAG and the variables you need Second, after a certain level of complexity, it's easier to read a DAG when arranged by time because you have to think less about that dimension; it's inherent to the layout. The time ordering algorithm in ggdag automates much of this for you, although, as we saw earlier, it's sometimes helpful to give it more information about the order. -A related topic is feedback loops. +A related topic is feedback loops [@murray2022]. Often, we think about two things that mutually cause each other as happening in a circle, like global warming and A/C use (A/C use increases global warming, which makes it hotter, which increases A/C use, and so on). It's tempting to visualize that relationship like this: @@ -1389,7 +1390,7 @@ Case-control studies are beneficial when the outcome under study is rare or take Participants are selected into the study based on their outcome: once a person has an event, they are entered as a case and matched with a control who hasn't had the event. Often, they are matched on other factors as well. -Matched case-control studies are selection biased by design. +Matched case-control studies are selection biased by design [@mansournia2013]. In @fig-case-control, when we condition on selection into the study, we lose the ability to close all backdoor paths, even if we control for `confounder`. From the DAG, it would appear that the entire design is invalid! @@ -1456,7 +1457,7 @@ As mentioned above, you may be able to identify alternate adjustment sets. 
If the missing variable is required to close all backdoor paths completely, you can and should conduct a sensitivity analysis to understand the impact of not having it. This is the subject of [Chapter -@sec-sensitivity]. -Under some lucky circumstances, you can also use a *proxy* confounder. +Under some lucky circumstances, you can also use a *proxy* confounder [@miao2018]. A proxy confounder is a variable closely related to the confounder such that controlling for it controls for some of the effects of the missing variable. Consider an expansion of the fundamental confounding relationship where `q` has a cause, `p`, as in @fig-proxy-confounder. Technically, if we don't have `q`, we can't close the backdoor path, and our effect will be biased. @@ -1611,7 +1612,7 @@ Yet, almost everything we measure is inaccurate to some degree. The true DAG for the data at hand inherently conditions on the measured version of variables. In that sense, your data are always subtly-wrong, a sort of unreliable narrator. When should we include this information in the DAG? -We recommend first focusing on the causal structure of the DAG as if you had perfectly measured each variable. +We recommend first focusing on the causal structure of the DAG as if you had perfectly measured each variable [@hernan2021]. Then, consider how mismeasurement and missingness might affect the realized data, particularly related to the exposure, outcome, and critical confounders. You may prefer to present this as an alternative DAG to consider strategies for addressing the bias arising from those sources, e.g., imputation or sensitivity analyses. After all, the DAG in \@ fig-error_dag makes you think the question is unanswerable because we have no method to close all backdoor paths. @@ -1639,8 +1640,8 @@ Finally, we recommend checking your DAG for robustness. You can never verify the correctness of your DAG under most conditions, but you can use the implications in your DAG to support it. 
Three types of robustness checks can be helpful depending on the circumstances. -1. **Negative controls**. These come in two flavors: negative exposure controls and negative outcome controls. The idea is to find something associated with one but not the other, e.g., the outcome but not the exposure, so there should be no effect. Since there should be no effect, you now have a measurement for how well you control for *other* effects (e.g., the difference from null). Ideally, the confounders for negative controls are similar to the research question. -2. **DAG-data consistency**. Negative controls are an implication of your DAG. An extension of this idea is that there are *many* such implications. Because blocking a path removes statistical dependencies from that path, you can check those assumptions in several places in your DAG. +1. **Negative controls** [@Lipsitch2010]. These come in two flavors: negative exposure controls and negative outcome controls. The idea is to find something associated with one but not the other, e.g., the outcome but not the exposure, so there should be no effect. Since there should be no effect, you now have a measurement for how well you control for *other* effects (e.g., the difference from null). Ideally, the confounders for negative controls are similar to the research question. +2. **DAG-data consistency** [@Textor2016]. Negative controls are an implication of your DAG. An extension of this idea is that there are *many* such implications. Because blocking a path removes statistical dependencies from that path, you can check those assumptions in several places in your DAG. 3. **Alternate adjustment sets**. Adjustment sets should give roughly the same answer because, outside of random and measurement errors, they are all sets that block backdoor paths. If more than one adjustment set seems reasonable, you can use that as a sensitivity analysis by checking multiple models. We'll discuss these in detail in [Chapter -@sec-sensitivity]. 
diff --git a/chapters/chapter-16.qmd b/chapters/chapter-16.qmd index 7c97d41..e2b53cb 100644 --- a/chapters/chapter-16.qmd +++ b/chapters/chapter-16.qmd @@ -4,4 +4,7 @@ ## Functional form, hetereogenous effects, and joint causes -## Fitting interaction terms in causal models \ No newline at end of file +## Fitting interaction terms in causal models + + + \ No newline at end of file diff --git a/citations.bib b/citations.bib index d3ac8e2..54545a5 100644 --- a/citations.bib +++ b/citations.bib @@ -521,4 +521,184 @@ @article{Tennant2021 langid = {en} } -@book{Pearl_Glymour_Jewell_2021, place={Chichester, West Sussex}, title={Causal inference in statistics: A Primer}, publisher={Wiley}, author={Pearl, Judea and Glymour, Madelyn and Jewell, Nicholas P.}, year={2021}} \ No newline at end of file +@book{Pearl_Glymour_Jewell_2021, place={Chichester, West Sussex}, title={Causal inference in statistics: A Primer}, publisher={Wiley}, author={Pearl, Judea and Glymour, Madelyn and Jewell, Nicholas P.}, year={2021}} +@article{Banack2023, + title = {Collider Stratification Bias {I}: Principles and Structure}, + author = {Banack, Hailey R and Mayeda, Elizabeth Rose and Naimi, Ashley I and Fox, Matthew P and Whitcomb, Brian W}, + year = {2023}, + month = {10}, + date = {2023-10-10}, + journal = {American Journal of Epidemiology}, + doi = {10.1093/aje/kwad203}, + url = {http://dx.doi.org/10.1093/aje/kwad203}, + langid = {en} +} + +@article{vanderzander2019, + title = {Separators and adjustment sets in causal graphs: Complete criteria and an algorithmic framework}, + author = {van der Zander, Benito and {Li{\'{s}}kiewicz}, Maciej and Textor, Johannes}, + year = {2019}, + month = {05}, + date = {2019-05}, + journal = {Artificial Intelligence}, + pages = {1--40}, + volume = {270}, + doi = {10.1016/j.artint.2018.12.006}, + url = {http://dx.doi.org/10.1016/j.artint.2018.12.006}, + langid = {en} +} + +@article{weinberg2007, + title = {Can {DAGs} Clarify Effect Modification?}, + author = 
{Weinberg, Clarice R.}, + year = {2007}, + month = {09}, + date = {2007-09}, + journal = {Epidemiology}, + pages = {569--572}, + volume = {18}, + number = {5}, + doi = {10.1097/ede.0b013e318126c11d}, + url = {http://dx.doi.org/10.1097/ede.0b013e318126c11d}, + langid = {en} +} + +@article{Nilsson2021, + title = {A directed acyclic graph for interactions}, + author = {Nilsson, Anton and Bonander, Carl and {Strömberg}, Ulf and {Björk}, Jonas}, + year = {2020}, + month = {11}, + date = {2020-11-22}, + journal = {International Journal of Epidemiology}, + pages = {613--619}, + volume = {50}, + number = {2}, + doi = {10.1093/ije/dyaa211}, + url = {http://dx.doi.org/10.1093/ije/dyaa211}, + langid = {en} +} + +@article{sugiyama1981, + title = {Methods for Visual Understanding of Hierarchical System Structures}, + author = {Sugiyama, Kozo and Tagawa, Shojiro and Toda, Mitsuhiko}, + year = {1981}, + date = {1981}, + journal = {IEEE Transactions on Systems, Man, and Cybernetics}, + pages = {109--125}, + volume = {11}, + number = {2}, + doi = {10.1109/tsmc.1981.4308636}, + url = {http://dx.doi.org/10.1109/tsmc.1981.4308636} +} + +@article{lu2022, + title = {Toward a Clearer Definition of Selection Bias When Estimating Causal Effects}, + author = {Lu, Haidong and Cole, Stephen R. and Howe, Chanelle J. and Westreich, Daniel}, + year = {2022}, + month = {06}, + date = {2022-06-06}, + journal = {Epidemiology}, + pages = {699--706}, + volume = {33}, + number = {5}, + doi = {10.1097/ede.0000000000001516}, + url = {http://dx.doi.org/10.1097/ede.0000000000001516}, + langid = {en} +} + +@article{Brookhart2006, + title = {Variable Selection for Propensity Score Models}, + author = {Brookhart, M. Alan and Schneeweiss, Sebastian and Rothman, Kenneth J. and Glynn, Robert J. 
and Avorn, Jerry and {Stürmer}, Til}, + year = {2006}, + month = {04}, + date = {2006-04-19}, + journal = {American Journal of Epidemiology}, + pages = {1149--1156}, + volume = {163}, + number = {12}, + doi = {10.1093/aje/kwj149}, + url = {http://dx.doi.org/10.1093/aje/kwj149}, + langid = {en} +} + +@article{Hernán2009, + title = {Invited Commentary: Causal Diagrams and Measurement Bias}, + author = {{Hernán}, Miguel A. and Cole, Stephen R.}, + year = {2009}, + month = {09}, + date = {2009-09-15}, + journal = {American Journal of Epidemiology}, + pages = {959--962}, + volume = {170}, + number = {8}, + doi = {10.1093/aje/kwp293}, + url = {http://dx.doi.org/10.1093/aje/kwp293}, + langid = {en} +} + +@misc{murray2022, + title = {As the Wheel Turns: Causal Inference for Feedback Loops and Bidirectional Effects}, + author = {Murray, Eleanor J. and Kunicki, Zach}, + year = {2022}, + month = {07}, + date = {2022-07-18}, + url = {http://dx.doi.org/10.31219/osf.io/9em5q} +} + +@article{mansournia2013, + title = {Matched designs and causal diagrams}, + author = {Mansournia, Mohammad A and {Hernán}, Miguel A and Greenland, Sander}, + year = {2013}, + month = {06}, + date = {2013-06}, + journal = {International Journal of Epidemiology}, + pages = {860--869}, + volume = {42}, + number = {3}, + doi = {10.1093/ije/dyt083}, + url = {http://dx.doi.org/10.1093/ije/dyt083}, + langid = {en} +} + +@article{miao2018, + title = {Identifying causal effects with proxy variables of an unmeasured confounder}, + author = {Miao, Wang and Geng, Zhi and Tchetgen Tchetgen, Eric J}, + year = {2018}, + month = {08}, + date = {2018-08-13}, + journal = {Biometrika}, + pages = {987--993}, + volume = {105}, + number = {4}, + doi = {10.1093/biomet/asy038}, + url = {http://dx.doi.org/10.1093/biomet/asy038}, + langid = {en} +} + +@article{Lipsitch2010, + title = {Negative Controls: A Tool for Detecting Confounding and Bias in Observational Studies}, + author = {Lipsitch, Marc and Tchetgen Tchetgen, Eric and Cohen, Ted}, + year = {2010}, + month = {05}, + date = {2010-05}, + journal 
= {Epidemiology}, + pages = {383--388}, + volume = {21}, + number = {3}, + doi = {10.1097/ede.0b013e3181d61eeb}, + url = {http://dx.doi.org/10.1097/ede.0b013e3181d61eeb}, + langid = {en} +} + +@article{Textor2016, + title = {Robust causal inference using directed acyclic graphs: the {R} package {\textquoteleft}dagitty{\textquoteright}}, + author = {Textor, Johannes and van der Zander, Benito and Gilthorpe, Mark S. and {Li{\'{s}}kiewicz}, Maciej and Ellison, George T.H.}, + year = {2017}, + month = {01}, + date = {2017-01-15}, + journal = {International Journal of Epidemiology}, + pages = {dyw341}, + doi = {10.1093/ije/dyw341}, + url = {http://dx.doi.org/10.1093/ije/dyw341}, + langid = {en} +}