diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf2f..1958855bf 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,5 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.travis\.yml$ +^\.github$ +^CODE_OF_CONDUCT\.md$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 000000000..2d19fc766 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/build_book.yaml b/.github/workflows/build_book.yaml new file mode 100644 index 000000000..cdcba86cb --- /dev/null +++ b/.github/workflows/build_book.yaml @@ -0,0 +1,60 @@ +on: + push: + branches: main + pull_request: + branches: main + # to be able to trigger a manual build + workflow_dispatch: + schedule: + # run every day at 11 PM + - cron: '0 23 * * *' + +name: Render and deploy Book to Netlify + +env: + isExtPR: ${{ github.event.pull_request.head.repo.fork == true }} + RUST_BACKTRACE: 1 + +jobs: + build-deploy: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - name: Install Quarto + uses: quarto-dev/quarto-actions/install-quarto@v1 + with: + # To install LaTeX to build PDF book + tinytex: true + # uncomment below and fill to pin a version + # version: 0.9.105 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + + - name: Render book to all formats + # Add any command line argument needed + run: | + quarto render + + - name: Deploy to Netlify + if: contains(env.isExtPR, 'false') + id: netlify-deploy + uses: nwtgck/actions-netlify@v1.1 + with: + publish-dir: './_book' + production-branch: main + github-token: ${{ secrets.GITHUB_TOKEN }} + deploy-message: 'Deploy from GHA: ${{ github.event.pull_request.title || github.event.head_commit.message }} (${{ github.sha }})' + enable-pull-request-comment: false + enable-commit-comment: false + env: + NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} + NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID_2E }} + timeout-minutes: 1 diff --git a/.gitignore b/.gitignore index 0c451465c..00b8f139f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,11 +5,18 @@ _main.rds _book *.md -*.html +!CODE_OF_CONDUCT.md +/*.html +!plausible.html search_index.json libs *.rds _main.* -bookdown* tmp-pdfcrop-* figures +/.quarto/ +site_libs +/data/seattle-library-checkouts.csv +/data/seattle-library-checkouts.parquet +/data/seattle-library-checkouts +oreilly diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d3c3534e3..000000000 --- a/.travis.yml +++ /dev/null @@ -1,22 +0,0 @@ -language: R -cache: - packages: true - directories: - - _book - -addons: - apt: - packages: - - libxml2-dev - -script: - - Rscript -e 'bookdown::render_book("index.rmd")' - -deploy: - skip_cleanup: true - provider: s3 - access_key_id: AKIAJ6PXDYWD72R6HBYA - secret_access_key: - secure:
"KB6D4dRFyqABOUBC6q6CTI7WZQ+4kFOSDWNQFAbXJQR4TzR8J6uddAiSZyG8T1/8z+9Lm1VK417Zi0dGm3r3epbSnLClitBetvE11DoByomK+ey+NJ0MdXuXbFCJhX9l+8QDbDRLd/b2MEr36JXNaNQaLf5wdHImVVfcCm5STAIOM42plYMvz4Uhao+VjIKo+0IqiGHQHsNcU4qQXS4jd4FtO/t1xCwa7SgH0wwV2yJmeh8mM7QpmUEpBcZTHDvqZu6BitxtkYQDCh1iuBwhbPlYug/WOtyHmKYgU/c3+C+xW4OLv10OsE+eK6noEzIXQ80sPIyKMpkn+9P+7MnoRU/oZTXmYJOuXE5mvy+CiJ4TzZZxzB/g8HzklRRI4eFBmJ/zTTMmJMwBdbUhCXepARe4gr7pDFKhSTXvBVxljJBrkiGz6W1JeZ9nKzUbuIlWNJ9aaYM2UDMbRef7xyKlKbBNw1+90aTTW8Jo+0Sz3/R7daBTcnr0Bszg4QCaOMoxJJF/Ty/tTHiComAt/kNRqlSiU2g/Ch0jOz5TRV3c29OjQQ/a9ftf5pqlvgStwjjszgHQfRrd4mxGq2E/1gkPGL7ada+TWPAVjCc8HtPGK/36IjSccFB6qGkwTFf3uOBmAC2XVnJJlwG8v20nL5ZZwpCCbQANeQq/ILQsYUmk7RM=" - bucket: r4ds.had.co.nz - local-dir: _book diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..b36903fa8 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, +and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall +community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or +advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email +address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards +of acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies +when an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at [INSERT CONTACT +METHOD]. All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, +available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/DESCRIPTION b/DESCRIPTION index eca8156c5..7e4996ebf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,32 +2,39 @@ Package: r4ds Title: R for data science. 
Version: 0.1 Authors@R: c( - person("Hadley", "Wickham", , "hadley@rstudio.com", c("aut", "cre")), - person("Garrett", "Grolemund", , "garrett@rstudio.com", "aut") + person("Hadley", "Wickham", , "hadley@rstudio.com", role = c("aut", "cre")), + person("Mine", "\u00C7etinkaya-Rundel", , "mine@rstudio.com", role = "aut"), + person("Garrett", "Grolemund", , "garrett@rstudio.com", role = "aut") ) -Depends: R (>= 3.1.0) URL: https://github.com/hadley/r4ds +Depends: + R (>= 3.1.0) Imports: - bookdown, - condvis, - gapminder, - ggrepel, - hexbin, - htmltools, - htmlwidgets, - jpeg, - knitr, - Lahman, - leaflet, - maps, - microbenchmark, - nycflights13, - png, - pryr, - tidyverse, - viridis -Remotes: - hadley/ggplot2, - slowkow/ggrepel, - rstudio/bookdown, - rstudio/rmarkdown + arrow, + babynames, + curl (>= 5.0.0), + duckdb, + gapminder, + ggrepel, + ggridges, + ggthemes, + hexbin, + janitor, + Lahman, + leaflet, + maps, + nycflights13, + openxlsx, + palmerpenguins, + repurrrsive (>= 1.1.0), + tidymodels, + tidyverse (>= 2.0.0), + writexl +Suggests: + downlit, + jpeg, + knitr, + sessioninfo +Remotes: tidyverse/dplyr +Encoding: UTF-8 +License: CC BY-NC-ND 3.0 US diff --git a/EDA.Rmd b/EDA.Rmd deleted file mode 100644 index b8a2977b8..000000000 --- a/EDA.Rmd +++ /dev/null @@ -1,595 +0,0 @@ -# Exploratory Data Analysis - -## Introduction - -This chapter will show you how to use visualisation and transformation to explore your data in a systematic way, a task that statisticians call exploratory data analysis, or EDA for short. EDA is an iterative cycle. You: - -1. Generate questions about your data. - -1. Search for answers by visualising, transforming, and modelling your data. - -1. Use what you learn to refine your questions and/or generate new questions. - -EDA is not a formal process with a strict set of rules. More than anything, EDA is a state of mind. During the initial phases of EDA you should feel free to investigate every idea that occurs to you. Some of these ideas will pan out, and some will be dead ends. As your exploration continues, you will home in on a few particularly productive areas that you'll eventually write up and communicate to others. - -EDA is an important part of any data analysis, even if the questions are handed to you on a platter, because you always need to investigate the quality of your data. Data cleaning is just one application of EDA: you ask questions about whether your data meets your expectations or not. To do data cleaning, you'll need to deploy all the tools of EDA: visualisation, transformation, and modelling. - -### Prerequisites - -In this chapter we'll combine what you've learned about dplyr and ggplot2 to interactively ask questions, answer them with data, and then ask new questions. - -```{r setup, message = FALSE} -library(tidyverse) -``` - -## Questions - -> "There are no routine statistical questions, only questionable statistical -> routines." --- Sir David Cox - -> "Far better an approximate answer to the right question, which is often -> vague, than an exact answer to the wrong question, which can always be made -> precise." --- John Tukey - -Your goal during EDA is to develop an understanding of your data. The easiest way to do this is to use questions as tools to guide your investigation. When you ask a question, the question focuses your attention on a specific part of your dataset and helps you decide which graphs, models, or transformations to make. - -EDA is fundamentally a creative process. 
And like most creative processes, the key to asking _quality_ questions is to generate a large _quantity_ of questions. It is difficult to ask revealing questions at the start of your analysis because you do not know what insights are contained in your dataset. On the other hand, each new question that you ask will expose you to a new aspect of your data and increase your chance of making a discovery. You can quickly drill down into the most interesting parts of your data---and develop a set of thought-provoking questions---if you follow up each question with a new question based on what you find. - -There is no rule about which questions you should ask to guide your research. However, two types of questions will always be useful for making discoveries within your data. You can loosely word these questions as: - -1. What type of variation occurs within my variables? - -1. What type of covariation occurs between my variables? - -The rest of this chapter will look at these two questions. I'll explain what variation and covariation are, and I'll show you several ways to answer each question. To make the discussion easier, let's define some terms: - -* A __variable__ is a quantity, quality, or property that you can measure. - -* A __value__ is the state of a variable when you measure it. The value of a - variable may change from measurement to measurement. - -* An __observation__ is a set of measurements made under similar conditions - (you usually make all of the measurements in an observation at the same - time and on the same object). An observation will contain several values, - each associated with a different variable. I'll sometimes refer to - an observation as a data point. - -* __Tabular data__ is a set of values, each associated with a variable and an - observation. Tabular data is _tidy_ if each value is placed in its own - "cell", each variable in its own column, and each observation in its own - row. - -So far, all of the data that you've seen has been tidy. In real-life, most data isn't tidy, so we'll come back to these ideas again in [tidy data]. - -## Variation - -**Variation** is the tendency of the values of a variable to change from measurement to measurement. You can see variation easily in real life; if you measure any continuous variable twice, you will get two different results. This is true even if you measure quantities that are constant, like the speed of light. Each of your measurements will include a small amount of error that varies from measurement to measurement. Categorical variables can also vary if you measure across different subjects (e.g. the eye colors of different people), or different times (e.g. the energy levels of an electron at different moments). -Every variable has its own pattern of variation, which can reveal interesting information. The best way to understand that pattern is to visualise the distribution of the variable's values. - -### Visualising distributions - -How you visualise the distribution of a variable will depend on whether the variable is categorical or continuous. A variable is **categorical** if it can only take one of a small set of values. In R, categorical variables are usually saved as factors or character vectors. To examine the distribution of a categorical variable, use a bar chart: - -```{r} -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut)) -``` - -The height of the bars displays how many observations occurred with each x value. 
You can compute these values manually with `dplyr::count()`: - -```{r} -diamonds %>% - count(cut) -``` - -A variable is **continuous** if it can take any of an infinite set of ordered values. Numbers and date-times are two examples of continuous variables. To examine the distribution of a continuous variable, use a histogram: - -```{r} -ggplot(data = diamonds) + - geom_histogram(mapping = aes(x = carat), binwidth = 0.5) -``` - -You can compute this by hand by combining `dplyr::count()` and `ggplot2::cut_width()`: - -```{r} -diamonds %>% - count(cut_width(carat, 0.5)) -``` - -A histogram divides the x-axis into equally spaced bins and then uses the height of a bar to display the number of observations that fall in each bin. In the graph above, the tallest bar shows that almost 30,000 observations have a `carat` value between 0.25 and 0.75, which are the left and right edges of the bar. - -You can set the width of the intervals in a histogram with the `binwidth` argument, which is measured in the units of the `x` variable. You should always explore a variety of binwidths when working with histograms, as different binwidths can reveal different patterns. For example, here is how the graph above looks when we zoom into just the diamonds with a size of less than three carats and choose a smaller binwidth. - -```{r} -smaller <- diamonds %>% - filter(carat < 3) - -ggplot(data = smaller, mapping = aes(x = carat)) + - geom_histogram(binwidth = 0.1) -``` - -If you wish to overlay multiple histograms in the same plot, I recommend using `geom_freqpoly()` instead of `geom_histogram()`. `geom_freqpoly()` performs the same calculation as `geom_histogram()`, but instead of displaying the counts with bars, uses lines instead. It's much easier to understand overlapping lines than bars. - -```{r} -ggplot(data = smaller, mapping = aes(x = carat, colour = cut)) + - geom_freqpoly(binwidth = 0.1) -``` - -There are a few challenges with this type of plot, which we will come back to in [visualising a categorical and a continuous variable](#cat-cont). - -Now that you can visualise variation, what should you look for in your plots? And what type of follow-up questions should you ask? I've put together a list below of the most useful types of information that you will find in your graphs, along with some follow-up questions for each type of information. The key to asking good follow-up questions will be to rely on your curiosity (What do you want to learn more about?) as well as your skepticism (How could this be misleading?). - -### Typical values - -In both bar charts and histograms, tall bars show the common values of a variable, and shorter bars show less-common values. Places that do not have bars reveal values that were not seen in your data. To turn this information into useful questions, look for anything unexpected: - -* Which values are the most common? Why? - -* Which values are rare? Why? Does that match your expectations? - -* Can you see any unusual patterns? What might explain them? - -As an example, the histogram below suggests several interesting questions: - -* Why are there more diamonds at whole carats and common fractions of carats? - -* Why are there more diamonds slightly to the right of each peak than there - are slightly to the left of each peak? - -* Why are there no diamonds bigger than 3 carats? - -```{r} -ggplot(data = smaller, mapping = aes(x = carat)) + - geom_histogram(binwidth = 0.01) -``` - -Clusters of similar values suggest that subgroups exist in your data. 
To understand the subgroups, ask: - -* How are the observations within each cluster similar to each other? - -* How are the observations in separate clusters different from each other? - -* How can you explain or describe the clusters? - -* Why might the appearance of clusters be misleading? - -The histogram below shows the length (in minutes) of 272 eruptions of the Old Faithful Geyser in Yellowstone National Park. Eruption times appear to be clustered into two groups: there are short eruptions (of around 2 minutes) and long eruptions (4-5 minutes), but little in between. - -```{r} -ggplot(data = faithful, mapping = aes(x = eruptions)) + - geom_histogram(binwidth = 0.25) -``` - -Many of the questions above will prompt you to explore a relationship *between* variables, for example, to see if the values of one variable can explain the behavior of another variable. We'll get to that shortly. - -### Unusual values - -Outliers are observations that are unusual; data points that don't seem to fit the pattern. Sometimes outliers are data entry errors; other times outliers suggest important new science. When you have a lot of data, outliers are sometimes difficult to see in a histogram. For example, take the distribution of the `y` variable from the diamonds dataset. The only evidence of outliers is the unusually wide limits on the x-axis. - -```{r} -ggplot(diamonds) + - geom_histogram(mapping = aes(x = y), binwidth = 0.5) -``` - -There are so many observations in the common bins that the rare bins are so short that you can't see them (although maybe if you stare intently at 0 you'll spot something). To make it easy to see the unusual values, we need to zoom to small values of the y-axis with `coord_cartesian()`: - -```{r} -ggplot(diamonds) + - geom_histogram(mapping = aes(x = y), binwidth = 0.5) + - coord_cartesian(ylim = c(0, 50)) -``` - -(`coord_cartesian()` also has an `xlim()` argument for when you need to zoom into the x-axis. ggplot2 also has `xlim()` and `ylim()` functions that work slightly differently: they throw away the data outside the limits.) - -This allows us to see that there are three unusual values: 0, ~30, and ~60. We pluck them out with dplyr: - -```{r, include = FALSE} -old <- options(tibble.print_max = 10, tibble.print_min = 10) -``` - -```{r} -unusual <- diamonds %>% - filter(y < 3 | y > 20) %>% - select(price, x, y, z) %>% - arrange(y) -unusual -``` - -```{r, include = FALSE} -options(old) -``` - -The `y` variable measures one of the three dimensions of these diamonds, in mm. We know that diamonds can't have a width of 0mm, so these values must be incorrect. We might also suspect that measurements of 32mm and 59mm are implausible: those diamonds are over an inch long, but don't cost hundreds of thousands of dollars! - -It's good practice to repeat your analysis with and without the outliers. If they have minimal effect on the results, and you can't figure out why they're there, it's reasonable to replace them with missing values, and move on. However, if they have a substantial effect on your results, you shouldn't drop them without justification. You'll need to figure out what caused them (e.g. a data entry error) and disclose that you removed them in your write-up. - - -### Exercises - -1. Explore the distribution of each of the `x`, `y`, and `z` variables - in `diamonds`. What do you learn? Think about a diamond and how you - might decide which dimension is the length, width, and depth. - -1. Explore the distribution of `price`. 
Do you discover anything unusual - or surprising? (Hint: Carefully think about the `binwidth` and make sure - you try a wide range of values.) - -1. How many diamonds are 0.99 carat? How many are 1 carat? What - do you think is the cause of the difference? - -1. Compare and contrast `coord_cartesian()` vs `xlim()` or `ylim()` when - zooming in on a histogram. What happens if you leave `binwidth` unset? - What happens if you try and zoom so only half a bar shows? - -## Missing values - -If you've encountered unusual values in your dataset, and simply want to move on to the rest of your analysis, you have two options. - -1. Drop the entire row with the strange values: - - ```{r, eval = FALSE} - diamonds2 <- diamonds %>% - filter(between(y, 3, 20)) - ``` - - I don't recommend this option because just because one measurement - is invalid, doesn't mean all the measurements are. Additionally, if you - have low quality data, by time that you've applied this approach to every - variable you might find that you don't have any data left! - -1. Instead, I recommend replacing the unusual values with missing values. - The easiest way to do this is to use `mutate()` to replace the variable - with a modified copy. You can use the `ifelse()` function to replace - unusual values with `NA`: - - ```{r} - diamonds2 <- diamonds %>% - mutate(y = ifelse(y < 3 | y > 20, NA, y)) - ``` - -`ifelse()` has three arguments. The first argument `test` should be a logical vector. The result will contain the value of the second argument, `yes`, when `test` is `TRUE`, and the value of the third argument, `no`, when it is false. Alternatively to ifelse, use `dplyr::case_when()`. `case_when()` is particularly useful inside mutate when you want to create a new variable that relies on a complex combination of existing variables. - -Like R, ggplot2 subscribes to the philosophy that missing values should never silently go missing. It's not obvious where you should plot missing values, so ggplot2 doesn't include them in the plot, but it does warn that they've been removed: - -```{r, dev = "png"} -ggplot(data = diamonds2, mapping = aes(x = x, y = y)) + - geom_point() -``` - -To suppress that warning, set `na.rm = TRUE`: - -```{r, eval = FALSE} -ggplot(data = diamonds2, mapping = aes(x = x, y = y)) + - geom_point(na.rm = TRUE) -``` - -Other times you want to understand what makes observations with missing values different to observations with recorded values. For example, in `nycflights13::flights`, missing values in the `dep_time` variable indicate that the flight was cancelled. So you might want to compare the scheduled departure times for cancelled and non-cancelled times. You can do this by making a new variable with `is.na()`. - -```{r} -nycflights13::flights %>% - mutate( - cancelled = is.na(dep_time), - sched_hour = sched_dep_time %/% 100, - sched_min = sched_dep_time %% 100, - sched_dep_time = sched_hour + sched_min / 60 - ) %>% - ggplot(mapping = aes(sched_dep_time)) + - geom_freqpoly(mapping = aes(colour = cancelled), binwidth = 1/4) -``` - -However this plot isn't great because there are many more non-cancelled flights than cancelled flights. In the next section we'll explore some techniques for improving this comparison. - -### Exercises - -1. What happens to missing values in a histogram? What happens to missing - values in a bar chart? Why is there a difference? - -1. What does `na.rm = TRUE` do in `mean()` and `sum()`? 
- -## Covariation - -If variation describes the behavior _within_ a variable, covariation describes the behavior _between_ variables. **Covariation** is the tendency for the values of two or more variables to vary together in a related way. The best way to spot covariation is to visualise the relationship between two or more variables. How you do that should again depend on the type of variables involved. - -### A categorical and continuous variable {#cat-cont} - -It's common to want to explore the distribution of a continuous variable broken down by a categorical variable, as in the previous frequency polygon. The default appearance of `geom_freqpoly()` is not that useful for that sort of comparison because the height is given by the count. That means if one of the groups is much smaller than the others, it's hard to see the differences in shape. For example, let's explore how the price of a diamond varies with its quality: - -```{r} -ggplot(data = diamonds, mapping = aes(x = price)) + - geom_freqpoly(mapping = aes(colour = cut), binwidth = 500) -``` - -It's hard to see the difference in distribution because the overall counts differ so much: - -```{r, fig.width = "50%", fig.width = 4} -ggplot(diamonds) + - geom_bar(mapping = aes(x = cut)) -``` - -To make the comparison easier we need to swap what is displayed on the y-axis. Instead of displaying count, we'll display __density__, which is the count standardised so that the area under each frequency polygon is one. - -```{r} -ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) + - geom_freqpoly(mapping = aes(colour = cut), binwidth = 500) -``` - -There's something rather surprising about this plot - it appears that fair diamonds (the lowest quality) have the highest average price! But maybe that's because frequency polygons are a little hard to interpret - there's a lot going on in this plot. - -Another alternative to display the distribution of a continuous variable broken down by a categorical variable is the boxplot. A **boxplot** is a type of visual shorthand for a distribution of values that is popular among statisticians. Each boxplot consists of: - -* A box that stretches from the 25th percentile of the distribution to the - 75th percentile, a distance known as the interquartile range (IQR). In the - middle of the box is a line that displays the median, i.e. 50th percentile, - of the distribution. These three lines give you a sense of the spread of the - distribution and whether or not the distribution is symmetric about the - median or skewed to one side. - -* Visual points that display observations that fall more than 1.5 times the - IQR from either edge of the box. These outlying points are unusual - so are plotted individually. - -* A line (or whisker) that extends from each end of the box and goes to the - farthest non-outlier point in the distribution. - -```{r, echo = FALSE, out.width = "100%"} -knitr::include_graphics("images/EDA-boxplot.png") -``` - -Let's take a look at the distribution of price by cut using `geom_boxplot()`: - -```{r fig.height = 3} -ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + - geom_boxplot() -``` - -We see much less information about the distribution, but the boxplots are much more compact so we can more easily compare them (and fit more on one plot). It supports the counterintuitive finding that better quality diamonds are cheaper on average! In the exercises, you'll be challenged to figure out why. 
- -`cut` is an ordered factor: fair is worse than good, which is worse than very good and so on. Many categorical variables don't have such an intrinsic order, so you might want to reorder them to make a more informative display. One way to do that is with the `reorder()` function. - -For example, take the `class` variable in the `mpg` dataset. You might be interested to know how highway mileage varies across classes: - -```{r} -ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + - geom_boxplot() -``` - -To make the trend easier to see, we can reorder `class` based on the median value of `hwy`: - -```{r fig.height = 3} -ggplot(data = mpg) + - geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) -``` - -If you have long variable names, `geom_boxplot()` will work better if you flip it 90°. You can do that with `coord_flip()`. - -```{r} -ggplot(data = mpg) + - geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) + - coord_flip() -``` - -#### Exercises - -1. Use what you've learned to improve the visualisation of the departure times - of cancelled vs. non-cancelled flights. - -1. What variable in the diamonds dataset is most important for predicting - the price of a diamond? How is that variable correlated with cut? - Why does the combination of those two relationships lead to lower quality - diamonds being more expensive? - -1. Install the ggstance package, and create a horizontal boxplot. - How does this compare to using `coord_flip()`? - -1. One problem with boxplots is that they were developed in an era of - much smaller datasets and tend to display a prohibitively large - number of "outlying values". One approach to remedy this problem is - the letter value plot. Install the lvplot package, and try using - `geom_lv()` to display the distribution of price vs cut. What - do you learn? How do you interpret the plots? - -1. Compare and contrast `geom_violin()` with a facetted `geom_histogram()`, - or a coloured `geom_freqpoly()`. What are the pros and cons of each - method? - -1. If you have a small dataset, it's sometimes useful to use `geom_jitter()` - to see the relationship between a continuous and categorical variable. - The ggbeeswarm package provides a number of methods similar to - `geom_jitter()`. List them and briefly describe what each one does. - -### Two categorical variables - -To visualise the covariation between categorical variables, you'll need to count the number of observations for each combination. One way to do that is to rely on the built-in `geom_count()`: - -```{r} -ggplot(data = diamonds) + - geom_count(mapping = aes(x = cut, y = color)) -``` - -The size of each circle in the plot displays how many observations occurred at each combination of values. Covariation will appear as a strong correlation between specific x values and specific y values. - -Another approach is to compute the count with dplyr: - -```{r} -diamonds %>% - count(color, cut) -``` - -Then visualise with `geom_tile()` and the fill aesthetic: - -```{r} -diamonds %>% - count(color, cut) %>% - ggplot(mapping = aes(x = color, y = cut)) + - geom_tile(mapping = aes(fill = n)) -``` - -If the categorical variables are unordered, you might want to use the seriation package to simultaneously reorder the rows and columns in order to more clearly reveal interesting patterns. For larger plots, you might want to try the d3heatmap or heatmaply packages, which create interactive plots. - -#### Exercises - -1. 
How could you rescale the count dataset above to more clearly show - the distribution of cut within colour, or colour within cut? - -1. Use `geom_tile()` together with dplyr to explore how average flight - delays vary by destination and month of year. What makes the - plot difficult to read? How could you improve it? - -1. Why is it slightly better to use `aes(x = color, y = cut)` rather - than `aes(x = cut, y = color)` in the example above? - -### Two continuous variables - -You've already seen one great way to visualise the covariation between two continuous variables: draw a scatterplot with `geom_point()`. You can see covariation as a pattern in the points. For example, you can see an exponential relationship between the carat size and price of a diamond. - -```{r, dev = "png"} -ggplot(data = diamonds) + - geom_point(mapping = aes(x = carat, y = price)) -``` - -Scatterplots become less useful as the size of your dataset grows, because points begin to overplot, and pile up into areas of uniform black (as above). -You've already seen one way to fix the problem: using the `alpha` aesthetic to add transparency. - -```{r, dev = "png"} -ggplot(data = diamonds) + - geom_point(mapping = aes(x = carat, y = price), alpha = 1 / 100) -``` - -But using transparency can be challenging for very large datasets. Another solution is to use bin. Previously you used `geom_histogram()` and `geom_freqpoly()` to bin in one dimension. Now you'll learn how to use `geom_bin2d()` and `geom_hex()` to bin in two dimensions. - -`geom_bin2d()` and `geom_hex()` divide the coordinate plane into 2d bins and then use a fill color to display how many points fall into each bin. `geom_bin2d()` creates rectangular bins. `geom_hex()` creates hexagonal bins. You will need to install the hexbin package to use `geom_hex()`. - -```{r, fig.asp = 1, out.width = "50%", fig.align = "default", message = FALSE} -ggplot(data = smaller) + - geom_bin2d(mapping = aes(x = carat, y = price)) - -# install.packages("hexbin") -ggplot(data = smaller) + - geom_hex(mapping = aes(x = carat, y = price)) -``` - -Another option is to bin one continuous variable so it acts like a categorical variable. Then you can use one of the techniques for visualising the combination of a categorical and a continuous variable that you learned about. For example, you could bin `carat` and then for each group, display a boxplot: - -```{r} -ggplot(data = smaller, mapping = aes(x = carat, y = price)) + - geom_boxplot(mapping = aes(group = cut_width(carat, 0.1))) -``` - -`cut_width(x, width)`, as used above, divides `x` into bins of width `width`. By default, boxplots look roughly the same (apart from number of outliers) regardless of how many observations there are, so it's difficult to tell that each boxplot summarises a different number of points. One way to show that is to make the width of the boxplot proportional to the number of points with `varwidth = TRUE`. - -Another approach is to display approximately the same number of points in each bin. That's the job of `cut_number()`: - -```{r} -ggplot(data = smaller, mapping = aes(x = carat, y = price)) + - geom_boxplot(mapping = aes(group = cut_number(carat, 20))) -``` - -#### Exercises - -1. Instead of summarising the conditional distribution with a boxplot, you - could use a frequency polygon. What do you need to consider when using - `cut_width()` vs `cut_number()`? How does that impact a visualisation of - the 2d distribution of `carat` and `price`? - -1. Visualise the distribution of carat, partitioned by price. 
- -1. How does the price distribution of very large diamonds compare to small - diamonds? Is it as you expect, or does it surprise you? - -1. Combine two of the techniques you've learned to visualise the - combined distribution of cut, carat, and price. - -1. Two dimensional plots reveal outliers that are not visible in one - dimensional plots. For example, some points in the plot below have an - unusual combination of `x` and `y` values, which makes the points outliers - even though their `x` and `y` values appear normal when examined separately. - - ```{r, dev = "png"} - ggplot(data = diamonds) + - geom_point(mapping = aes(x = x, y = y)) + - coord_cartesian(xlim = c(4, 11), ylim = c(4, 11)) - ``` - - Why is a scatterplot a better display than a binned plot for this case? - -## Patterns and models - -Patterns in your data provide clues about relationships. If a systematic relationship exists between two variables it will appear as a pattern in the data. If you spot a pattern, ask yourself: - -+ Could this pattern be due to coincidence (i.e. random chance)? - -+ How can you describe the relationship implied by the pattern? - -+ How strong is the relationship implied by the pattern? - -+ What other variables might affect the relationship? - -+ Does the relationship change if you look at individual subgroups of the data? - -A scatterplot of Old Faithful eruption lengths versus the wait time between eruptions shows a pattern: longer wait times are associated with longer eruptions. The scatterplot also displays the two clusters that we noticed above. - -```{r fig.height = 2} -ggplot(data = faithful) + - geom_point(mapping = aes(x = eruptions, y = waiting)) -``` - -Patterns provide one of the most useful tools for data scientists because they reveal covariation. If you think of variation as a phenomenon that creates uncertainty, covariation is a phenomenon that reduces it. If two variables covary, you can use the values of one variable to make better predictions about the values of the second. If the covariation is due to a causal relationship (a special case), then you can use the value of one variable to control the value of the second. - -Models are a tool for extracting patterns out of data. For example, consider the diamonds data. It's hard to understand the relationship between cut and price, because cut and carat, and carat and price are tightly related. It's possible to use a model to remove the very strong relationship between price and carat so we can explore the subtleties that remain. The following code fits a model that predicts `price` from `carat` and then computes the residuals (the difference between the predicted value and the actual value). The residuals give us a view of the price of the diamond, once the effect of carat has been removed. - -```{r, dev = "png"} -library(modelr) - -mod <- lm(log(price) ~ log(carat), data = diamonds) - -diamonds2 <- diamonds %>% - add_residuals(mod) %>% - mutate(resid = exp(resid)) - -ggplot(data = diamonds2) + - geom_point(mapping = aes(x = carat, y = resid)) -``` - -Once you've removed the strong relationship between carat and price, you can see what you expect in the relationship between cut and price: relative to their size, better quality diamonds are more expensive. - -```{r} -ggplot(data = diamonds2) + - geom_boxplot(mapping = aes(x = cut, y = resid)) -``` - -You'll learn how models, and the modelr package, work in the final part of the book, [model](#model-intro). 
We're saving modelling for later because understanding what models are and how they work is easiest once you have tools of data wrangling and programming in hand. - -## ggplot2 calls - -As we move on from these introductory chapters, we'll transition to a more concise expression of ggplot2 code. So far we've been very explicit, which is helpful when you are learning: - -```{r, eval = FALSE} -ggplot(data = faithful, mapping = aes(x = eruptions)) + - geom_freqpoly(binwidth = 0.25) -``` - -Typically, the first one or two arguments to a function are so important that you should know them by heart. The first two arguments to `ggplot()` are `data` and `mapping`, and the first two arguments to `aes()` are `x` and `y`. In the remainder of the book, we won't supply those names. That saves typing, and, by reducing the amount of boilerplate, makes it easier to see what's different between plots. That's a really important programming concern that we'll come back in [functions]. - -Rewriting the previous plot more concisely yields: - -```{r, eval = FALSE} -ggplot(faithful, aes(eruptions)) + - geom_freqpoly(binwidth = 0.25) -``` - -Sometimes we'll turn the end of a pipeline of data transformation into a plot. Watch for the transition from `%>%` to `+`. I wish this transition wasn't necessary but unfortunately ggplot2 was created before the pipe was discovered. - -```{r, eval = FALSE} -diamonds %>% - count(cut, clarity) %>% - ggplot(aes(clarity, cut, fill = n)) + - geom_tile() -``` - -## Learning more - -If you want to learn more about the mechanics of ggplot2, I'd highly recommend grabbing a copy of the ggplot2 book: . It's been recently updated, so it includes dplyr and tidyr code, and has much more space to explore all the facets of visualisation. Unfortunately the book isn't generally available for free, but if you have a connection to a university you can probably get an electronic version for free through SpringerLink. - -Another useful resource is the [_R Graphics Cookbook_](https://amzn.com/1449316956) by Winston Chang. Much of the contents are available online at . - -I also recommend [_Graphical Data Analysis with R_](https://amzn.com/1498715230), by Antony Unwin. This is a book-length treatment similar to the material covered in this chapter, but has the space to go into much greater depth. diff --git a/EDA.qmd b/EDA.qmd new file mode 100644 index 000000000..720d27120 --- /dev/null +++ b/EDA.qmd @@ -0,0 +1,715 @@ +# Exploratory data analysis {#sec-exploratory-data-analysis} + +```{r} +#| echo: false + +source("_common.R") +``` + +## Introduction + +This chapter will show you how to use visualization and transformation to explore your data in a systematic way, a task that statisticians call exploratory data analysis, or EDA for short. +EDA is an iterative cycle. +You: + +1. Generate questions about your data. + +2. Search for answers by visualizing, transforming, and modelling your data. + +3. Use what you learn to refine your questions and/or generate new questions. + +EDA is not a formal process with a strict set of rules. +More than anything, EDA is a state of mind. +During the initial phases of EDA you should feel free to investigate every idea that occurs to you. +Some of these ideas will pan out, and some will be dead ends. +As your exploration continues, you will home in on a few particularly productive insights that you'll eventually write up and communicate to others. 
+ +EDA is an important part of any data analysis, even if the primary research questions are handed to you on a platter, because you always need to investigate the quality of your data. +Data cleaning is just one application of EDA: you ask questions about whether your data meets your expectations or not. +To do data cleaning, you'll need to deploy all the tools of EDA: visualization, transformation, and modelling. + +### Prerequisites + +In this chapter we'll combine what you've learned about dplyr and ggplot2 to interactively ask questions, answer them with data, and then ask new questions. + +```{r} +#| label: setup +#| message: false + +library(tidyverse) +``` + +## Questions + +> "There are no routine statistical questions, only questionable statistical routines." --- Sir David Cox + +> "Far better an approximate answer to the right question, which is often vague, than an exact answer to the wrong question, which can always be made precise." --- John Tukey + +Your goal during EDA is to develop an understanding of your data. +The easiest way to do this is to use questions as tools to guide your investigation. +When you ask a question, the question focuses your attention on a specific part of your dataset and helps you decide which graphs, models, or transformations to make. + +EDA is fundamentally a creative process. +And like most creative processes, the key to asking *quality* questions is to generate a large *quantity* of questions. +It is difficult to ask revealing questions at the start of your analysis because you do not know what insights can be gleaned from your dataset. +On the other hand, each new question that you ask will expose you to a new aspect of your data and increase your chance of making a discovery. +You can quickly drill down into the most interesting parts of your data---and develop a set of thought-provoking questions---if you follow up each question with a new question based on what you find. + +There is no rule about which questions you should ask to guide your research. +However, two types of questions will always be useful for making discoveries within your data. +You can loosely word these questions as: + +1. What type of variation occurs within my variables? + +2. What type of covariation occurs between my variables? + +The rest of this chapter will look at these two questions. +We'll explain what variation and covariation are, and we'll show you several ways to answer each question. + +## Variation + +**Variation** is the tendency of the values of a variable to change from measurement to measurement. +You can see variation easily in real life; if you measure any continuous variable twice, you will get two different results. +This is true even if you measure quantities that are constant, like the speed of light. +Each of your measurements will include a small amount of error that varies from measurement to measurement. +Variables can also vary if you measure across different subjects (e.g., the eye colors of different people) or at different times (e.g., the energy levels of an electron at different moments). +Every variable has its own pattern of variation, which can reveal interesting information about how it varies between measurements on the same observation as well as across observations. +The best way to understand that pattern is to visualize the distribution of the variable's values, which you've learned about in @sec-data-visualization. 
+ +We'll start our exploration by visualizing the distribution of weights (`carat`) of \~54,000 diamonds from the `diamonds` dataset. +Since `carat` is a numerical variable, we can use a histogram: + +```{r} +#| fig-alt: | +#| A histogram of carats of diamonds, with the x-axis ranging from 0 to 4.5 +#| and the y-axis ranging from 0 to 30000. The distribution is right skewed +#| with very few diamonds in the bin centered at 0, almost 30000 diamonds in +#| the bin centered at 0.5, approximately 15000 diamonds in the bin centered +#| at 1, and much fewer, approximately 5000 diamonds in the bin centered at +#| 1.5. Beyond this, there's a trailing tail. + +ggplot(diamonds, aes(x = carat)) + + geom_histogram(binwidth = 0.5) +``` + +Now that you can visualize variation, what should you look for in your plots? +And what type of follow-up questions should you ask? +We've put together a list below of the most useful types of information that you will find in your graphs, along with some follow-up questions for each type of information. +The key to asking good follow-up questions will be to rely on your curiosity (What do you want to learn more about?) as well as your skepticism (How could this be misleading?). + +### Typical values + +In both bar charts and histograms, tall bars show the common values of a variable, and shorter bars show less-common values. +Places that do not have bars reveal values that were not seen in your data. +To turn this information into useful questions, look for anything unexpected: + +- Which values are the most common? + Why? + +- Which values are rare? + Why? + Does that match your expectations? + +- Can you see any unusual patterns? + What might explain them? + +Let's take a look at the distribution of `carat` for smaller diamonds. + +```{r} +#| fig-alt: | +#| A histogram of carats of diamonds, with the x-axis ranging from 0 to 3 and +#| the y-axis ranging from 0 to roughly 2500. The binwidth is quite narrow +#| (0.01), resulting in a very large number of skinny bars. The distribution +#| is right skewed, with many peaks followed by bars in decreasing heights, +#| until a sharp increase at the next peak. + +smaller <- diamonds |> + filter(carat < 3) + +ggplot(smaller, aes(x = carat)) + + geom_histogram(binwidth = 0.01) +``` + +This histogram suggests several interesting questions: + +- Why are there more diamonds at whole carats and common fractions of carats? + +- Why are there more diamonds slightly to the right of each peak than there are slightly to the left of each peak? + +Visualizations can also reveal clusters, which suggest that subgroups exist in your data. +To understand the subgroups, ask: + +- How are the observations within each subgroup similar to each other? + +- How are the observations in separate clusters different from each other? + +- How can you explain or describe the clusters? + +- Why might the appearance of clusters be misleading? + +Some of these questions can be answered with the data while some will require domain expertise about the data. +Many of them will prompt you to explore a relationship *between* variables, for example, to see if the values of one variable can explain the behavior of another variable. +We'll get to that shortly. + +### Unusual values + +Outliers are observations that are unusual; data points that don't seem to fit the pattern. 
+Sometimes outliers are data entry errors, sometimes they are simply values at the extremes that happened to be observed in this data collection, and other times they suggest important new discoveries. +When you have a lot of data, outliers are sometimes difficult to see in a histogram. +For example, take the distribution of the `y` variable from the diamonds dataset. +The only evidence of outliers is the unusually wide limits on the x-axis. + +```{r} +#| fig-alt: | +#| A histogram of lengths of diamonds. The x-axis ranges from 0 to 60 and +#| the y-axis ranges from 0 to 12000. There is a peak around 5, and the +#| data appear to be completely clustered around the peak. + +ggplot(diamonds, aes(x = y)) + + geom_histogram(binwidth = 0.5) +``` + +There are so many observations in the common bins that the rare bins are very short, making it very difficult to see them (although maybe if you stare intently at 0 you'll spot something). +To make it easy to see the unusual values, we need to zoom to small values of the y-axis with `coord_cartesian()`: + +```{r} +#| fig-alt: | +#| A histogram of lengths of diamonds. The x-axis ranges from 0 to 60 and the +#| y-axis ranges from 0 to 50. There is a peak around 5, and the data +#| appear to be completely clustered around the peak. Other than those data, +#| there is one bin at 0 with a height of about 8, one a little over 30 with +#| a height of 1 and another one a little below 60 with a height of 1. + +ggplot(diamonds, aes(x = y)) + + geom_histogram(binwidth = 0.5) + + coord_cartesian(ylim = c(0, 50)) +``` + +`coord_cartesian()` also has an `xlim()` argument for when you need to zoom into the x-axis. +ggplot2 also has `xlim()` and `ylim()` functions that work slightly differently: they throw away the data outside the limits. + +This allows us to see that there are three unusual values: 0, \~30, and \~60. +We pluck them out with dplyr: + +```{r} +#| include: false + +old <- options(tibble.print_max = 10, tibble.print_min = 10) +``` + +```{r} +unusual <- diamonds |> + filter(y < 3 | y > 20) |> + select(price, x, y, z) |> + arrange(y) +unusual +``` + +```{r} +#| include: false + +options(old) +``` + +The `y` variable measures one of the three dimensions of these diamonds, in mm. +We know that diamonds can't have a width of 0mm, so these values must be incorrect. +By doing EDA, we have discovered missing data that was coded as 0, which we never would have found by simply searching for `NA`s. +Going forward we might choose to re-code these values as `NA`s in order to prevent misleading calculations. +We might also suspect that measurements of 32mm and 59mm are implausible: those diamonds are over an inch long, but don't cost hundreds of thousands of dollars! + +It's good practice to repeat your analysis with and without the outliers. +If they have minimal effect on the results, and you can't figure out why they're there, it's reasonable to omit them, and move on. +However, if they have a substantial effect on your results, you shouldn't drop them without justification. +You'll need to figure out what caused them (e.g., a data entry error) and disclose that you removed them in your write-up. + +### Exercises + +1. Explore the distribution of each of the `x`, `y`, and `z` variables in `diamonds`. + What do you learn? + Think about a diamond and how you might decide which dimension is the length, width, and depth. + +2. Explore the distribution of `price`. + Do you discover anything unusual or surprising? 
+ (Hint: Carefully think about the `binwidth` and make sure you try a wide range of values.) + +3. How many diamonds are 0.99 carat? + How many are 1 carat? + What do you think is the cause of the difference? + +4. Compare and contrast `coord_cartesian()` vs. `xlim()` or `ylim()` when zooming in on a histogram. + What happens if you leave `binwidth` unset? + What happens if you try and zoom so only half a bar shows? + +## Unusual values {#sec-unusual-values-eda} + +If you've encountered unusual values in your dataset, and simply want to move on to the rest of your analysis, you have two options. + +1. Drop the entire row with the strange values: + + ```{r} + #| eval: false + + diamonds2 <- diamonds |> + filter(between(y, 3, 20)) + ``` + + We don't recommend this option because one invalid value doesn't imply that all the other values for that observation are also invalid. + Additionally, if you have low quality data, by the time that you've applied this approach to every variable you might find that you don't have any data left! + +2. Instead, we recommend replacing the unusual values with missing values. + The easiest way to do this is to use `mutate()` to replace the variable with a modified copy. + You can use the `if_else()` function to replace unusual values with `NA`: + + ```{r} + diamonds2 <- diamonds |> + mutate(y = if_else(y < 3 | y > 20, NA, y)) + ``` + +It's not obvious where you should plot missing values, so ggplot2 doesn't include them in the plot, but it does warn that they've been removed: + +```{r} +#| dev: "png" +#| fig-alt: | +#| A scatterplot of widths vs. lengths of diamonds. There is a strong, +#| linear association between the two variables. All but one of the diamonds +#| has length greater than 3. The one outlier has a length of 0 and a width +#| of about 6.5. + +ggplot(diamonds2, aes(x = x, y = y)) + + geom_point() +``` + +To suppress that warning, set `na.rm = TRUE`: + +```{r} +#| eval: false + +ggplot(diamonds2, aes(x = x, y = y)) + + geom_point(na.rm = TRUE) +``` + +Other times you want to understand what makes observations with missing values different to observations with recorded values. +For example, in `nycflights13::flights`[^eda-1], missing values in the `dep_time` variable indicate that the flight was cancelled. +So you might want to compare the scheduled departure times for cancelled and non-cancelled flights. +You can do this by making a new variable, using `is.na()` to check if `dep_time` is missing. + +[^eda-1]: Remember that when we need to be explicit about where a function (or dataset) comes from, we'll use the special form `package::function()` or `package::dataset`. + +```{r} +#| fig-alt: | +#| A frequency polygon of scheduled departure times of flights. Two lines +#| represent flights that are cancelled and not cancelled. The x-axis ranges +#| from 0 to 25 hours and the y-axis ranges from 0 to 10000. The number of +#| non-cancelled flights is much higher than the number of cancelled flights. + +nycflights13::flights |> + mutate( + cancelled = is.na(dep_time), + sched_hour = sched_dep_time %/% 100, + sched_min = sched_dep_time %% 100, + sched_dep_time = sched_hour + (sched_min / 60) + ) |> + ggplot(aes(x = sched_dep_time)) + + geom_freqpoly(aes(color = cancelled), binwidth = 1/4) +```
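As an aside, the `%/%` (integer division) and `%%` (remainder) operators used above turn a time stored as a number like `1530` (meaning 15:30) into a decimal hour. A minimal sketch of the arithmetic, shown for a single value:

```{r}
#| eval: false

# 1530 represents a scheduled departure at 15:30
1530 %/% 100 # 15, the hour
1530 %% 100  # 30, the minutes
15 + 30 / 60 # 15.5, the decimal hour plotted on the x-axis
```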
+    What happens to missing values in a bar chart?
+    Why is there a difference in how missing values are handled in histograms and bar charts?
+
+2.  What does `na.rm = TRUE` do in `mean()` and `sum()`?
+
+3.  Recreate the frequency plot of `sched_dep_time` colored by whether the flight was cancelled or not.
+    Also facet by the `cancelled` variable.
+    Experiment with different values of the `scales` argument in the faceting function to mitigate the effect of more non-cancelled flights than cancelled flights.
+
+## Covariation
+
+If variation describes the behavior *within* a variable, covariation describes the behavior *between* variables.
+**Covariation** is the tendency for the values of two or more variables to vary together in a related way.
+The best way to spot covariation is to visualize the relationship between two or more variables.
+
+### A categorical and a numerical variable {#sec-cat-num}
+
+For example, let's explore how the price of a diamond varies with its quality (measured by `cut`) using `geom_freqpoly()`:
+
+```{r}
+#| fig-alt: |
+#|   A frequency polygon of prices of diamonds where each cut (Fair, 
+#|   Good, Very Good, Premium, and Ideal) is represented with a different color 
+#|   line. The x-axis ranges from 0 to 30000 and the y-axis ranges from 0 to 
+#|   5000. The lines overlap a great deal, suggesting similar frequency 
+#|   distributions of prices of diamonds. One notable feature is that 
+#|   Ideal diamonds have the highest peak around 1500.
+
+ggplot(diamonds, aes(x = price)) + 
+  geom_freqpoly(aes(color = cut), binwidth = 500, linewidth = 0.75)
+```
+
+Note that ggplot2 uses an ordered color scale for `cut` because it's defined as an ordered factor variable in the data.
+You'll learn more about these in @sec-ordered-factors.
+
+The default appearance of `geom_freqpoly()` is not that useful here because the height, determined by the overall count, differs so much across `cut`s, making it hard to see the differences in the shapes of their distributions.
+
+To make the comparison easier we need to swap what is displayed on the y-axis.
+Instead of displaying count, we'll display the **density**, which is the count standardized so that the area under each frequency polygon is one.
+
+```{r}
+#| fig-alt: |
+#|   A frequency polygon of densities of prices of diamonds where each cut 
+#|   (Fair, Good, Very Good, Premium, and Ideal) is represented with a 
+#|   different color line. The x-axis ranges from 0 to 20000. The lines overlap 
+#|   a great deal, suggesting similar density distributions of prices of 
+#|   diamonds. One notable feature is that all but Fair diamonds have high peaks 
+#|   around a price of 1500 and Fair diamonds have a higher mean than others.
+
+ggplot(diamonds, aes(x = price, y = after_stat(density))) + 
+  geom_freqpoly(aes(color = cut), binwidth = 500, linewidth = 0.75)
+```
+
+Note that we're mapping the density to the `y` aesthetic, but since `density` is not a variable in the `diamonds` dataset, we need to first calculate it.
+We use the `after_stat()` function to do so.
+
+There's something rather surprising about this plot - it appears that fair diamonds (the lowest quality) have the highest average price!
+But maybe that's because frequency polygons are a little hard to interpret - there's a lot going on in this plot.
+
+A visually simpler way to explore this relationship is with side-by-side boxplots.
+
+```{r}
+#| fig-alt: |
+#|   Side-by-side boxplots of prices of diamonds by cut.
The distribution of +#| prices is right skewed for each cut (Fair, Good, Very Good, Premium, and +#| Ideal). The medians are close to each other, with the median for Ideal +#| diamonds lowest and that for Fair highest. + +ggplot(diamonds, aes(x = cut, y = price)) + + geom_boxplot() +``` + +We see much less information about the distribution, but the boxplots are much more compact so we can more easily compare them (and fit more on one plot). +It supports the counter-intuitive finding that better quality diamonds are typically cheaper! +In the exercises, you'll be challenged to figure out why. + +`cut` is an ordered factor: fair is worse than good, which is worse than very good and so on. +Many categorical variables don't have such an intrinsic order, so you might want to reorder them to make a more informative display. +One way to do that is with `fct_reorder()`. +You'll learn more about that function in @sec-modifying-factor-order, but we want to give you a quick preview here because it's so useful. +For example, take the `class` variable in the `mpg` dataset. +You might be interested to know how highway mileage varies across classes: + +```{r} +#| fig-alt: | +#| Side-by-side boxplots of highway mileages of cars by class. Classes are +#| on the x-axis (2seaters, compact, midsize, minivan, pickup, subcompact, +#| and suv). + +ggplot(mpg, aes(x = class, y = hwy)) + + geom_boxplot() +``` + +To make the trend easier to see, we can reorder `class` based on the median value of `hwy`: + +```{r} +#| fig-alt: | +#| Side-by-side boxplots of highway mileages of cars by class. Classes are +#| on the x-axis and ordered by increasing median highway mileage (pickup, +#| suv, minivan, 2seater, subcompact, compact, and midsize). + +ggplot(mpg, aes(x = fct_reorder(class, hwy, median), y = hwy)) + + geom_boxplot() +``` + +If you have long variable names, `geom_boxplot()` will work better if you flip it 90°. +You can do that by exchanging the x and y aesthetic mappings. + +```{r} +#| fig-alt: | +#| Side-by-side boxplots of highway mileages of cars by class. Classes are +#| on the y-axis and ordered by increasing median highway mileage. + +ggplot(mpg, aes(x = hwy, y = fct_reorder(class, hwy, median))) + + geom_boxplot() +``` + +#### Exercises + +1. Use what you've learned to improve the visualization of the departure times of cancelled vs. non-cancelled flights. + +2. Based on EDA, what variable in the diamonds dataset appears to be most important for predicting the price of a diamond? + How is that variable correlated with cut? + Why does the combination of those two relationships lead to lower quality diamonds being more expensive? + +3. Instead of exchanging the x and y variables, add `coord_flip()` as a new layer to the vertical boxplot to create a horizontal one. + How does this compare to exchanging the variables? + +4. One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of "outlying values". + One approach to remedy this problem is the letter value plot. + Install the lvplot package, and try using `geom_lv()` to display the distribution of price vs. cut. + What do you learn? + How do you interpret the plots? + +5. Create a visualization of diamond prices vs. a categorical variable from the `diamonds` dataset using `geom_violin()`, then a faceted `geom_histogram()`, then a colored `geom_freqpoly()`, and then a colored `geom_density()`. + Compare and contrast the four plots. 
+    What are the pros and cons of each method of visualizing the distribution of a numerical variable based on the levels of a categorical variable?
+
+6.  If you have a small dataset, it's sometimes useful to use `geom_jitter()` to avoid overplotting and more easily see the relationship between a continuous and categorical variable.
+    The ggbeeswarm package provides a number of methods similar to `geom_jitter()`.
+    List them and briefly describe what each one does.
+
+### Two categorical variables
+
+To visualize the covariation between categorical variables, you'll need to count the number of observations for each combination of levels of these categorical variables.
+One way to do that is to rely on the built-in `geom_count()`:
+
+```{r}
+#| fig-alt: |
+#|   A scatterplot of color vs. cut of diamonds. There is one point for each
+#|   combination of levels of cut (Fair, Good, Very Good, Premium, and Ideal) 
+#|   and color (D, E, F, G, H, I, and J). The sizes of the points represent 
+#|   the number of observations for that combination. The legend indicates 
+#|   that these sizes range between 1000 and 4000.
+
+ggplot(diamonds, aes(x = cut, y = color)) +
+  geom_count()
+```
+
+The size of each circle in the plot displays how many observations occurred at each combination of values.
+Covariation will appear as a strong correlation between specific x values and specific y values.
+
+Another approach for exploring the relationship between these variables is computing the counts with dplyr:
+
+```{r}
+diamonds |> 
+  count(color, cut)
+```
+
+Then visualize with `geom_tile()` and the fill aesthetic:
+
+```{r}
+#| fig-alt: |
+#|   A tile plot of cut vs. color of diamonds. Each tile represents a 
+#|   cut/color combination and tiles are colored according to the number of 
+#|   observations in each tile. There are more Ideal diamonds than other cuts, 
+#|   with the highest number being Ideal diamonds with color G. Fair diamonds 
+#|   and diamonds with color I are the lowest in frequency.
+
+diamonds |> 
+  count(color, cut) |> 
+  ggplot(aes(x = color, y = cut)) +
+  geom_tile(aes(fill = n))
+```
+
+If the categorical variables are unordered, you might want to use the seriation package to simultaneously reorder the rows and columns in order to more clearly reveal interesting patterns.
+For larger plots, you might want to try the heatmaply package, which creates interactive plots.
+
+#### Exercises
+
+1.  How could you rescale the count dataset above to more clearly show the distribution of cut within color, or color within cut?
+
+2.  What different data insights do you get with a segmented bar chart if color is mapped to the `x` aesthetic and `cut` is mapped to the `fill` aesthetic?
+    Calculate the counts that fall into each of the segments.
+
+3.  Use `geom_tile()` together with dplyr to explore how average flight departure delays vary by destination and month of year.
+    What makes the plot difficult to read?
+    How could you improve it?
+
+### Two numerical variables
+
+You've already seen one great way to visualize the covariation between two numerical variables: draw a scatterplot with `geom_point()`.
+You can see covariation as a pattern in the points.
+For example, you can see a positive relationship between the carat size and price of a diamond: diamonds with more carats have a higher price.
+The relationship is exponential.
+
+```{r}
+#| dev: "png"
+#| fig-alt: |
+#|   A scatterplot of price vs. carat. The relationship is positive, somewhat 
+#|   strong, and exponential.
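+# `smaller` isn't created in this excerpt; it's assumed to be defined
+# earlier in the chapter by dropping the few diamonds of 3 carats or more,
+# along the lines of: smaller <- diamonds |> filter(carat < 3)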
+
+ggplot(smaller, aes(x = carat, y = price)) + 
+  geom_point()
+```
+
+(In this section we'll use the `smaller` dataset to stay focused on the bulk of the diamonds that are smaller than 3 carats.)
+
+Scatterplots become less useful as the size of your dataset grows, because points begin to overplot, and pile up into areas of uniform black, making it hard to judge differences in the density of the data across the 2-dimensional space as well as making it hard to spot the trend.
+You've already seen one way to fix the problem: using the `alpha` aesthetic to add transparency.
+
+```{r}
+#| dev: "png"
+#| fig-alt: |
+#|   A scatterplot of price vs. carat. The relationship is positive, somewhat 
+#|   strong, and exponential. The points are transparent, showing clusters where 
+#|   the number of points is higher than other areas. The most obvious clusters 
+#|   are for diamonds with 1, 1.5, and 2 carats.
+
+ggplot(smaller, aes(x = carat, y = price)) + 
+  geom_point(alpha = 1 / 100)
+```
+
+But using transparency can be challenging for very large datasets.
+Another solution is to use binning.
+Previously you used `geom_histogram()` and `geom_freqpoly()` to bin in one dimension.
+Now you'll learn how to use `geom_bin2d()` and `geom_hex()` to bin in two dimensions.
+
+`geom_bin2d()` and `geom_hex()` divide the coordinate plane into 2d bins and then use a fill color to display how many points fall into each bin.
+`geom_bin2d()` creates rectangular bins.
+`geom_hex()` creates hexagonal bins.
+You will need to install the hexbin package to use `geom_hex()`.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 3
+#| fig-alt: |
+#|   Plot 1: A binned density plot of price vs. carat. Plot 2: A hexagonal bin 
+#|   plot of price vs. carat. Both plots show that the highest density of 
+#|   diamonds have low carats and low prices.
+
+ggplot(smaller, aes(x = carat, y = price)) +
+  geom_bin2d()
+
+# install.packages("hexbin")
+ggplot(smaller, aes(x = carat, y = price)) +
+  geom_hex()
+```
+
+Another option is to bin one continuous variable so it acts like a categorical variable.
+Then you can use one of the techniques for visualizing the combination of a categorical and a continuous variable that you learned about.
+For example, you could bin `carat` and then for each group, display a boxplot:
+
+```{r}
+#| fig-alt: |
+#|   Side-by-side box plots of price by carat. Each box plot represents diamonds 
+#|   that are 0.1 carats apart in weight. The box plots show that as carat 
+#|   increases the median price increases as well. Additionally, diamonds with 
+#|   1.5 carats or lower have right skewed price distributions, 1.5 to 2 have 
+#|   roughly symmetric price distributions, and diamonds that weigh more have 
+#|   left skewed distributions. Cheaper, smaller diamonds have outliers on the 
+#|   higher end, more expensive, bigger diamonds have outliers on the lower end.
+
+ggplot(smaller, aes(x = carat, y = price)) + 
+  geom_boxplot(aes(group = cut_width(carat, 0.1)))
+```
+
+`cut_width(x, width)`, as used above, divides `x` into bins of width `width`.
+By default, boxplots look roughly the same (apart from the number of outliers) regardless of how many observations there are, so it's difficult to tell that each boxplot summarizes a different number of points.
+One way to show that is to make the width of the boxplot proportional to the number of points with `varwidth = TRUE`.
+
+#### Exercises
+
+1.  Instead of summarizing the conditional distribution with a boxplot, you could use a frequency polygon.
+    What do you need to consider when using `cut_width()` vs. `cut_number()`?
+    How does that impact a visualization of the 2d distribution of `carat` and `price`?
+
+2.  Visualize the distribution of `carat`, partitioned by `price`.
+
+3.  How does the price distribution of very large diamonds compare to small diamonds?
+    Is it as you expect, or does it surprise you?
+
+4.  Combine two of the techniques you've learned to visualize the combined distribution of cut, carat, and price.
+
+5.  Two-dimensional plots reveal outliers that are not visible in one-dimensional plots.
+    For example, some points in the following plot have an unusual combination of `x` and `y` values, which makes the points outliers even though their `x` and `y` values appear normal when examined separately.
+    Why is a scatterplot a better display than a binned plot for this case?
+
+    ```{r}
+    #| eval: false
+    diamonds |> 
+      filter(x >= 4) |> 
+      ggplot(aes(x = x, y = y)) +
+      geom_point() +
+      coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
+    ```
+
+6.  Instead of creating boxes of equal width with `cut_width()`, we could create boxes that contain a roughly equal number of points with `cut_number()`.
+    What are the advantages and disadvantages of this approach?
+
+    ```{r}
+    #| eval: false
+    ggplot(smaller, aes(x = carat, y = price)) + 
+      geom_boxplot(aes(group = cut_number(carat, 20)))
+    ```
+
+## Patterns and models
+
+If a systematic relationship exists between two variables it will appear as a pattern in the data.
+If you spot a pattern, ask yourself:
+
+-   Could this pattern be due to coincidence (i.e. random chance)?
+
+-   How can you describe the relationship implied by the pattern?
+
+-   How strong is the relationship implied by the pattern?
+
+-   What other variables might affect the relationship?
+
+-   Does the relationship change if you look at individual subgroups of the data?
+
+Patterns in your data provide clues about relationships, i.e., they reveal covariation.
+If you think of variation as a phenomenon that creates uncertainty, covariation is a phenomenon that reduces it.
+If two variables covary, you can use the values of one variable to make better predictions about the values of the second.
+If the covariation is due to a causal relationship (a special case), then you can use the value of one variable to control the value of the second.
+
+Models are a tool for extracting patterns out of data.
+For example, consider the diamonds data.
+It's hard to understand the relationship between cut and price, because cut and carat, and carat and price are tightly related.
+It's possible to use a model to remove the very strong relationship between price and carat so we can explore the subtleties that remain.
+The following code fits a model that predicts `price` from `carat` and then computes the residuals (the difference between the predicted value and the actual value).
+The residuals give us a view of the price of the diamond, once the effect of carat has been removed.
+Note that instead of using the raw values of `price` and `carat`, we log transform them first, and fit a model to the log-transformed values.
+Then, we exponentiate the residuals to put them back in the scale of raw prices.
+
+```{r}
+#| message: false
+#| dev: "png"
+#| fig-alt: |
+#|   A scatterplot of residuals vs. carat of diamonds. The x-axis ranges from 0 
+#|   to 5, the y-axis ranges from 0 to almost 4. Much of the data are clustered 
+#|   around low values of carat and residuals. There is a clear, curved pattern 
+#|   showing a decrease in residuals as carat increases.
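+# The model below uses tidymodels, which hasn't been loaded yet in this
+# chapter; if you don't already have it, install it first with
+# install.packages("tidymodels")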
+
+library(tidymodels)
+
+diamonds <- diamonds |>
+  mutate(
+    log_price = log(price),
+    log_carat = log(carat)
+  )
+
+diamonds_fit <- linear_reg() |>
+  fit(log_price ~ log_carat, data = diamonds)
+
+diamonds_aug <- augment(diamonds_fit, new_data = diamonds) |>
+  mutate(.resid = exp(.resid))
+
+ggplot(diamonds_aug, aes(x = carat, y = .resid)) + 
+  geom_point()
+```
+
+Once you've removed the strong relationship between carat and price, you can see what you expect in the relationship between cut and price: relative to their size, better quality diamonds are more expensive.
+
+```{r}
+#| fig-alt: |
+#|   Side-by-side box plots of residuals by cut. The x-axis displays the various 
+#|   cuts (Fair to Ideal), the y-axis ranges from 0 to almost 5. The medians are 
+#|   quite similar, between roughly 0.75 and 1.25. Each of the distributions of 
+#|   residuals is right skewed, with many outliers on the higher end.
+
+ggplot(diamonds_aug, aes(x = cut, y = .resid)) + 
+  geom_boxplot()
+```
+
+We're not discussing modeling in this book because understanding what models are and how they work is easiest once you have tools of data wrangling and programming in hand.
+
+## Summary
+
+In this chapter you've learned a variety of tools to help you understand the variation within your data.
+You've seen techniques that work with a single variable at a time and with a pair of variables.
+This might seem painfully restrictive if you have tens or hundreds of variables in your data, but they're the foundation upon which all other techniques are built.
+
+In the next chapter, we'll focus on the tools we can use to communicate our results.
diff --git a/README.md b/README.md
index 7ec718ef5..b9ce43b97 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,66 @@
 # R for Data Science
 
-This is code and text behind the [R for Data Science](http://r4ds.had.co.nz)
-book.
+
-The R packages used in this book can be installed via
+[![Render and deploy Book to Netlify](https://github.com/hadley/r4ds/actions/workflows/build_book.yaml/badge.svg)](https://github.com/hadley/r4ds/actions/workflows/build_book.yaml)
+
+
+
+This repository contains the source of the [R for Data Science](http://r4ds.hadley.nz) book.
+The book is built using [Quarto](https://quarto.org/).
+
+## Images
+
+### Omnigraffle drawings
+
+-   Font: 12pt Guardian Sans Condensed / Ubuntu mono
+
+-   Export as 300 dpi png.
+
+-   Website font is 18 px = 13.5 pt, so scale dpi to match font sizes: 270 = 300 \* 12 / 13.5.
+    (I also verified this empirically by screenshotting.)
+
+    ``` r
+    #| echo: FALSE
+    #| out.width: NULL
+    knitr::include_graphics("diagrams/transform.png", dpi = 270)
+    ```
+
+### Screenshots
+
+-   Make sure you're using a light theme.
+    For small interface elements (e.g., toolbars), zoom in twice.
+
+-   Screenshot with Cmd + Shift + 4.
+
+-   Don't need to set dpi:
+
+    ``` r
+    #| echo: FALSE
+    #| out.width: NULL
+    knitr::include_graphics("screenshots/rstudio-wg.png")
+    ```
+
+### O'Reilly
+
+To generate the book for O'Reilly, build the book, then:
 
 ```{r}
-devtools::install_github("hadley/r4ds")
+# pak::pak("hadley/htmlbook")
+htmlbook::convert_book()
+
+html <- list.files("oreilly", pattern = "[.]html$", full.names = TRUE)
+file.copy(html, "../r-for-data-science-2e/", overwrite = TRUE)
+
+pngs <- list.files("oreilly", pattern = "[.]png$", full.names = TRUE, recursive = TRUE)
+dest <- gsub("oreilly", "../r-for-data-science-2e/", pngs)
+fs::dir_create(unique(dirname(dest)))
+file.copy(pngs, dest, overwrite = TRUE)
 ```
 
-The site is built using [bookdown package](https://github.com/rstudio/bookdown).
-To create the site, you also need:
-* [pandoc](http://johnmacfarlane.net/pandoc/)
+Then commit and push to Atlas.
+
+## Code of Conduct
+
+Please note that r4ds uses a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html).
+By contributing to this book, you agree to abide by its terms.
diff --git a/_bookdown.yml b/_bookdown.yml
deleted file mode 100644
index 679c1e9ed..000000000
--- a/_bookdown.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-new_session: yes
-
-rmd_files: [
-  "index.rmd",
-  "intro.Rmd",
-
-  "explore.Rmd",
-  "visualize.Rmd",
-  "workflow-basics.Rmd",
-  "transform.Rmd",
-  "workflow-scripts.Rmd",
-  "EDA.Rmd",
-  "workflow-projects.Rmd",
-
-  "wrangle.Rmd",
-  "tibble.Rmd",
-  "import.Rmd",
-  "tidy.Rmd",
-  "relational-data.Rmd",
-  "strings.Rmd",
-  "factors.Rmd",
-  "datetimes.Rmd",
-
-  "program.Rmd",
-  "pipes.Rmd",
-  "functions.Rmd",
-  "vectors.Rmd",
-  "iteration.Rmd",
-
-  "model.Rmd",
-  "model-basics.Rmd",
-  "model-building.Rmd",
-  "model-many.Rmd",
-
-  "communicate.Rmd",
-  "rmarkdown.Rmd",
-  "communicate-plots.Rmd",
-  "rmarkdown-formats.Rmd",
-  "rmarkdown-workflow.Rmd",
-]
-
-before_chapter_script: "_common.R"
diff --git a/_common.R b/_common.R
index 363008388..036b73352 100644
--- a/_common.R
+++ b/_common.R
@@ -1,15 +1,55 @@
 set.seed(1014)
 
-options(digits = 3)
 
 knitr::opts_chunk$set(
   comment = "#>",
   collapse = TRUE,
-  cache = TRUE,
-  out.width = "70%",
-  fig.align = 'center',
+  # cache = TRUE,
+  fig.retina = 2,
   fig.width = 6,
-  fig.asp = 0.618,  # 1 / phi
+  fig.asp = 2/3,
   fig.show = "hold"
 )
 
-options(dplyr.print_min = 6, dplyr.print_max = 6)
+options(
+  dplyr.print_min = 6,
+  dplyr.print_max = 6,
+  pillar.max_footer_lines = 2,
+  pillar.min_chars = 15,
+  stringr.view_n = 6,
+  # Temporarily deactivate cli output for quarto
+  cli.num_colors = 0,
+  cli.hyperlink = FALSE,
+  pillar.bold = TRUE,
+  width = 77 # 80 - 3 for #> comment
+)
+
+ggplot2::theme_set(ggplot2::theme_gray(12))
+
+# use results: "asis" when setting a status for a chapter
+status <- function(type) {
+  status <- switch(type,
+    polishing = "should be readable but is currently undergoing final polishing",
+    restructuring = "is undergoing heavy restructuring and may be confusing or incomplete",
+    drafting = "is currently a dumping ground for ideas, and we don't recommend reading it",
+    complete = "is largely complete and just needs final proof reading",
+    stop("Invalid `type`", call. = FALSE)
+  )
+
+  class <- switch(type,
+    polishing = "note",
+    restructuring = "important",
+    drafting = "important",
+    complete = "note"
+  )
+
+  cat(paste0(
+    "\n",
+    ":::: status\n",
+    "::: callout-", class, " \n",
+    "You are reading the work-in-progress second edition of R for Data Science. ",
+    "This chapter ", status, ". 
", + "You can find the complete first edition at .\n", + ":::\n", + "::::\n" + )) +} diff --git a/_freeze/arrow/execute-results/html.json b/_freeze/arrow/execute-results/html.json new file mode 100644 index 000000000..686eabe6d --- /dev/null +++ b/_freeze/arrow/execute-results/html.json @@ -0,0 +1,14 @@ +{ + "hash": "8ae7678995e8995f137d44c9c7d335ba", + "result": { + "markdown": "---\nfreeze: true\n---\n\n\n# Arrow {#sec-arrow}\n\n\n\n:::: status\n::: callout-note \nYou are reading the work-in-progress second edition of R for Data Science. This chapter is largely complete and just needs final proof reading. You can find the complete first edition at .\n:::\n::::\n\n\n## Introduction\n\nCSV files are designed to be easily read by humans.\nThey're a good interchange format because they're very simple and they can be read by every tool under the sun.\nBut CSV files aren't very efficient: you have to do quite a lot of work to read the data into R.\nIn this chapter, you'll learn about a powerful alternative: the [parquet format](https://parquet.apache.org/), an open standards-based format widely used by big data systems.\n\nWe'll pair parquet files with [Apache Arrow](https://arrow.apache.org), a multi-language toolbox designed for efficient analysis and transport of large datasets.\nWe'll use Apache Arrow via the the [arrow package](https://arrow.apache.org/docs/r/), which provides a dplyr backend allowing you to analyze larger-than-memory datasets using familiar dplyr syntax.\nAs an additional benefit, arrow is extremely fast: you'll see some examples later in the chapter.\n\nBoth arrow and dbplyr provide dplyr backends, so you might wonder when to use each.\nIn many cases, the choice is made for you, as in the data is already in a database or in parquet files, and you'll want to work with it as is.\nBut if you're starting with your own data (perhaps CSV files), you can either load it into a database or convert it to parquet.\nIn general, it's hard to know what will work best, so in the early stages of your analysis we'd encourage you to try both and pick the one that works the best for you.\n\n(A big thanks to Danielle Navarro who contributed the initial version of this chapter.)\n\n### Prerequisites\n\nIn this chapter, we'll continue to use the tidyverse, particularly dplyr, but we'll pair it with the arrow package which is designed specifically for working with large data.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tidyverse)\nlibrary(arrow)\n```\n:::\n\n\nLater in the chapter, we'll also see some connections between arrow and duckdb, so we'll also need dbplyr and duckdb.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dbplyr, warn.conflicts = FALSE)\nlibrary(duckdb)\n#> Loading required package: DBI\n```\n:::\n\n\n## Getting the data\n\nWe begin by getting a dataset worthy of these tools: a dataset of item checkouts from Seattle public libraries, available online at [data.seattle.gov/Community/Checkouts-by-Title/tmmm-ytt6](https://data.seattle.gov/Community/Checkouts-by-Title/tmmm-ytt6).\nThis dataset contains 41,389,465 rows that tell you how many times each book was checked out each month from April 2005 to October 2022.\n\nThe following code will get you a cached copy of the data.\nThe data is a 9GB CSV file, so it will take some time to download.\nI highly recommend using `curl::multidownload()` to get very large files as it's built for exactly this purpose: it gives you a progress bar and it can resume the download if its interrupted.\n\n\n::: {.cell}\n\n```{.r 
.cell-code}\ndir.create(\"data\", showWarnings = FALSE)\n\ncurl::multi_download(\n \"https://r4ds.s3.us-west-2.amazonaws.com/seattle-library-checkouts.csv\",\n \"data/seattle-library-checkouts.csv\",\n resume = TRUE\n)\n```\n:::\n\n\n## Opening a dataset\n\nLet's start by taking a look at the data.\nAt 9GB, this file is large enough that we probably don't want to load the whole thing into memory.\nA good rule of thumb is that you usually want at least twice as much memory as the size of the data, and many laptops top out at 16 Gb.\nThis means we want to avoid `read_csv()` and instead use the `arrow::open_dataset()`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(\n sources = \"data/seattle-library-checkouts.csv\", \n format = \"csv\"\n)\n```\n:::\n\n\nWhat happens when this code is run?\n`open_dataset()` will scan a few thousand rows to figure out the structure of the dataset.\nThen it records what it's found and stops; it will only read further rows as you specifically request them.\nThis metadata is what we see if we print `seattle_csv`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv\n#> FileSystemDataset with 1 csv file\n#> UsageClass: string\n#> CheckoutType: string\n#> MaterialType: string\n#> CheckoutYear: int64\n#> CheckoutMonth: int64\n#> Checkouts: int64\n#> Title: string\n#> ISBN: null\n#> Creator: string\n#> Subjects: string\n#> Publisher: string\n#> PublicationYear: string\n```\n:::\n\n\nThe first line in the output tells you that `seattle_csv` is stored locally on-disk as a single CSV file; it will only be loaded into memory as needed.\nThe remainder of the output tells you the column type that arrow has imputed for each column.\n\nWe can see what's actually in with `glimpse()`.\nThis reveals that there are \\~41 million rows and 12 columns, and shows us a few values.\n\n\n::: {.cell hash='arrow_cache/html/glimpse-data_07c924738790eb185ebdd8973443e90d'}\n\n```{.r .cell-code}\nseattle_csv |> glimpse()\n#> FileSystemDataset with 1 csv file\n#> 41,389,465 rows x 12 columns\n#> $ UsageClass \"Physical\", \"Physical\", \"Digital\", \"Physical\", \"Ph…\n#> $ CheckoutType \"Horizon\", \"Horizon\", \"OverDrive\", \"Horizon\", \"Hor…\n#> $ MaterialType \"BOOK\", \"BOOK\", \"EBOOK\", \"BOOK\", \"SOUNDDISC\", \"BOO…\n#> $ CheckoutYear 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20…\n#> $ CheckoutMonth 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…\n#> $ Checkouts 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 2, 3, 2, 1, 3, 2,…\n#> $ Title \"Super rich : a guide to having it all / Russell S…\n#> $ ISBN \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\"…\n#> $ Creator \"Simmons, Russell\", \"Barclay, James, 1965-\", \"Tim …\n#> $ Subjects \"Self realization, Conduct of life, Attitude Psych…\n#> $ Publisher \"Gotham Books,\", \"Pyr,\", \"Random House, Inc.\", \"Di…\n#> $ PublicationYear \"c2011.\", \"2010.\", \"2015\", \"2005.\", \"c2004.\", \"c20…\n```\n:::\n\n\nWe can start to use this dataset with dplyr verbs, using `collect()` to force arrow to perform the computation and return some data.\nFor example, this code tells us the total number of checkouts per year:\n\n\n::: {.cell hash='arrow_cache/html/unnamed-chunk-5_7a5e1ce0bed4d69e849dff75d0c0d8d3'}\n\n```{.r .cell-code}\nseattle_csv |> \n count(CheckoutYear, wt = Checkouts) |> \n arrange(CheckoutYear) |> \n collect()\n#> # A tibble: 18 × 2\n#> CheckoutYear n\n#> \n#> 1 2005 3798685\n#> 2 2006 6599318\n#> 3 2007 7126627\n#> 4 2008 8438486\n#> 5 2009 9135167\n#> 6 2010 8608966\n#> # … with 
12 more rows\n```\n:::\n\n\nThanks to arrow, this code will work regardless of how large the underlying dataset is.\nBut it's currently rather slow: on Hadley's computer, it took \\~10s to run.\nThat's not terrible given how much data we have, but we can make it much faster by switching to a better format.\n\n## The parquet format {#sec-parquet}\n\nTo make this data easier to work with, lets switch to the parquet file format and split it up into multiple files.\nThe following sections will first introduce you to parquet and partitioning, and then apply what we learned to the Seattle library data.\n\n### Advantages of parquet\n\nLike CSV, parquet is used for rectangular data, but instead of being a text format that you can read with any file editor, it's a custom binary format designed specifically for the needs of big data.\nThis means that:\n\n- Parquet files are usually smaller the equivalent CSV file.\n Parquet relies on [efficient encodings](https://parquet.apache.org/docs/file-format/data-pages/encodings/) to keep file size down, and supports file compression.\n This helps make parquet files fast because there's less data to move from disk to memory.\n\n- Parquet files have a rich type system.\n As we talked about in @sec-col-types, a CSV file does not provide any information about column types.\n For example, a CSV reader has to guess whether `\"08-10-2022\"` should be parsed as a string or a date.\n In contrast, parquet files store data in a way that records the type along with the data.\n\n- Parquet files are \"column-oriented\".\n This means that they're organized column-by-column, much like R's data frame.\n This typically leads to better performance for data analysis tasks compared to CSV files, which are organized row-by-row.\n\n- Parquet files are \"chunked\", which makes it possible to work on different parts of the file at the same time, and, if you're lucky, to skip some chunks all together.\n\n### Partitioning\n\nAs datasets get larger and larger, storing all the data in a single file gets increasingly painful and it's often useful to split large datasets across many files.\nWhen this structuring is done intelligently, this strategy can lead to significant improvements in performance because many analyses will only require a subset of the files.\n\nThere are no hard and fast rules about how to partition your dataset: the results will depend on your data, access patterns, and the systems that read the data.\nYou're likely to need to do some experimentation before you find the ideal partitioning for your situation.\nAs a rough guide, arrow suggests that you avoid files smaller than 20MB and larger than 2GB and avoid partitions that produce more than 10,000 files.\nYou should also try to partition by variables that you filter by; as you'll see shortly, that allows arrow to skip a lot of work by reading only the relevant files.\n\n### Rewriting the Seattle library data\n\nLet's apply these ideas to the Seattle library data to see how they play out in practice.\nWe're going to partition by `CheckoutYear`, since it's likely some analyses will only want to look at recent data and partitioning by year yields 18 chunks of a reasonable size.\n\nTo rewrite the data we define the partition using `dplyr::group_by()` and then save the partitions to a directory with `arrow::write_dataset()`.\n`write_dataset()` has two important arguments: a directory where we'll create the files and the format we'll use.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npq_path <- 
\"data/seattle-library-checkouts\"\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n write_dataset(path = pq_path, format = \"parquet\")\n```\n:::\n\n\nThis takes about a minute to run; as we'll see shortly this is an initial investment that pays off by making future operations much much faster.\n\nLet's take a look at what we just produced:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntibble(\n files = list.files(pq_path, recursive = TRUE),\n size_MB = file.size(file.path(pq_path, files)) / 1024^2\n)\n#> # A tibble: 18 × 2\n#> files size_MB\n#> \n#> 1 CheckoutYear=2005/part-0.parquet 109.\n#> 2 CheckoutYear=2006/part-0.parquet 164.\n#> 3 CheckoutYear=2007/part-0.parquet 178.\n#> 4 CheckoutYear=2008/part-0.parquet 195.\n#> 5 CheckoutYear=2009/part-0.parquet 214.\n#> 6 CheckoutYear=2010/part-0.parquet 222.\n#> # … with 12 more rows\n```\n:::\n\n\nOur single 9GB CSV file has been rewritten into 18 parquet files.\nThe file names use a \"self-describing\" convention used by the [Apache Hive](https://hive.apache.org) project.\nHive-style partitions name folders with a \"key=value\" convention, so as you might guess, the `CheckoutYear=2005` directory contains all the data where `CheckoutYear` is 2005.\nEach file is between 100 and 300 MB and the total size is now around 4 GB, a little over half the size of the original CSV file.\nThis is as we expect since parquet is a much more efficient format.\n\n## Using dplyr with arrow\n\nNow we've created these parquet files, we'll need to read them in again.\nWe use `open_dataset()` again, but this time we give it a directory:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_pq <- open_dataset(pq_path)\n```\n:::\n\n\nNow we can write our dplyr pipeline.\nFor example, we could count the total number of books checked out in each month for the last five years:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nquery <- seattle_pq |> \n filter(CheckoutYear >= 2018, MaterialType == \"BOOK\") |>\n group_by(CheckoutYear, CheckoutMonth) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(CheckoutYear, CheckoutMonth)\n```\n:::\n\n\nWriting dplyr code for arrow data is conceptually similar to dbplyr, @sec-import-databases: you write dplyr code, which is automatically transformed into a query that the Apache Arrow C++ library understands, which is then executed when you call `collect()`.\nIf we print out the `query` object we can see a little information about what we expect Arrow to return when the execution takes place:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nquery\n#> FileSystemDataset (query)\n#> CheckoutYear: int32\n#> CheckoutMonth: int64\n#> TotalCheckouts: int64\n#> \n#> * Grouped by CheckoutYear\n#> * Sorted by CheckoutYear [asc], CheckoutMonth [asc]\n#> See $.data for the source Arrow object\n```\n:::\n\n\nAnd we can get the results by calling `collect()`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nquery |> collect()\n#> # A tibble: 58 × 3\n#> # Groups: CheckoutYear [5]\n#> CheckoutYear CheckoutMonth TotalCheckouts\n#> \n#> 1 2018 1 355101\n#> 2 2018 2 309813\n#> 3 2018 3 344487\n#> 4 2018 4 330988\n#> 5 2018 5 318049\n#> 6 2018 6 341825\n#> # … with 52 more rows\n```\n:::\n\n\nLike dbplyr, arrow only understands some R expressions, so you may not be able to write exactly the same code you usually would.\nHowever, the list of operations and functions supported is fairly extensive and continues to grow; find a complete list of currently supported functions in `?acero`.\n\n### Performance {#sec-parquet-fast}\n\nLet's take a 
quick look at the performance impact of switching from CSV to parquet.\nFirst, let's time how long it takes to calculate the number of books checked out in each month of 2021, when the data is stored as a single large csv:\n\n\n::: {.cell hash='arrow_cache/html/dataset-performance-csv_483a703c116b20d0e51a2183c096cfa2'}\n\n```{.r .cell-code}\nseattle_csv |> \n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |> \n system.time()\n#> user system elapsed \n#> 11.997 1.189 11.343\n```\n:::\n\n\nNow let's use our new version of the dataset in which the Seattle library checkout data has been partitioned into 18 smaller parquet files:\n\n\n::: {.cell hash='arrow_cache/html/dataset-performance-multiple-parquet_de9e0ac3cfc08b2e6eef4a12f94f8391'}\n\n```{.r .cell-code}\nseattle_pq |> \n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |> \n system.time()\n#> user system elapsed \n#> 0.272 0.063 0.063\n```\n:::\n\n\nThe \\~100x speedup in performance is attributable to two factors: the multi-file partitioning, and the format of individual files:\n\n- Partitioning improves performance because this query uses `CheckoutYear == 2021` to filter the data, and arrow is smart enough to recognize that it only needs to read 1 of the 18 parquet files.\n- The parquet format improves performance by storing data in a binary format that can be read more directly into memory. The column-wise format and rich metadata means that arrow only needs to read the four columns actually used in the query (`CheckoutYear`, `MaterialType`, `CheckoutMonth`, and `Checkouts`).\n\nThis massive difference in performance is why it pays off to convert large CSVs to parquet!\n\n### Using dbplyr with arrow\n\nThere's one last advantage of parquet and arrow --- it's very easy to turn an arrow dataset into a DuckDB database (@sec-import-databases) by calling `arrow::to_duckdb()`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_pq |> \n to_duckdb() |>\n filter(CheckoutYear >= 2018, MaterialType == \"BOOK\") |>\n group_by(CheckoutYear) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutYear)) |>\n collect()\n#> Warning: Missing values are always removed in SQL aggregation functions.\n#> Use `na.rm = TRUE` to silence this warning\n#> This warning is displayed once every 8 hours.\n#> # A tibble: 5 × 2\n#> CheckoutYear TotalCheckouts\n#> \n#> 1 2022 2431502\n#> 2 2021 2266438\n#> 3 2020 1241999\n#> 4 2019 3931688\n#> 5 2018 3987569\n```\n:::\n\n\nThe neat thing about `to_duckdb()` is that the transfer doesn't involve any memory copying, and speaks to the goals of the arrow ecosystem: enabling seamless transitions from one computing environment to another.\n\n## Summary\n\nIn this chapter, you've been given a taste of the arrow package, which provides a dplyr backend for working with large on-disk datasets.\nIt can work with CSV files, its much much faster if you convert your data to parquet.\nParquet is a binary data format that's designed specifically for data analysis on modern computers.\nFar fewer tools can work with parquet files compared to CSV, but it's partitioned, compressed, and columnar structure makes it much more efficient to analyze.\n\nNext up you'll learn about your first non-rectangular data source, which you'll handle using tools provided by the 
tidyr package.\nWe'll focus on data that comes from JSON files, but the general principles apply to tree-like data regardless of its source.\n", + "supporting": [], + "filters": [ + "rmarkdown/pagebreak.lua" + ], + "includes": {}, + "engineDependencies": {}, + "preserve": {}, + "postProcess": true + } +} \ No newline at end of file diff --git a/_output.yaml b/_output.yaml deleted file mode 100644 index a82ee78bf..000000000 --- a/_output.yaml +++ /dev/null @@ -1,15 +0,0 @@ -bookdown::gitbook: - config: - toc: - collapse: section - before: | -
-        <li><a href="./">R for Data Science</a></li>
  • - edit: - link: https://github.com/hadley/r4ds/edit/master/%s - text: "Edit" - sharing: no - css: r4ds.css - -bookdown::pdf_book: - latex_engine: "xelatex" - diff --git a/_quarto.yml b/_quarto.yml new file mode 100644 index 000000000..4768ee9f3 --- /dev/null +++ b/_quarto.yml @@ -0,0 +1,86 @@ +project: + type: book + output-dir: _book + +book: + title: "R for Data Science (2e)" + reader-mode: true + + page-footer: + left: | + R for Data Science (2e) was written by Hadley Wickham, Mine + Çetinkaya-Rundel, and Garrett Grolemund. + right: | + This book was built with Quarto. + cover-image: cover.jpg + favicon: cover.jpg + site-url: https://r4ds.hadley.nz/ + repo-url: https://github.com/hadley/r4ds/ + repo-branch: main + repo-actions: [edit, issue] + chapters: + - index.qmd + + - preface-2e.qmd + - intro.qmd + + - part: whole-game.qmd + chapters: + - data-visualize.qmd + - workflow-basics.qmd + - data-transform.qmd + - workflow-style.qmd + - data-tidy.qmd + - workflow-scripts.qmd + - data-import.qmd + - workflow-help.qmd + + - part: visualize.qmd + chapters: + - layers.qmd + - EDA.qmd + - communication.qmd + + - part: transform.qmd + chapters: + - logicals.qmd + - numbers.qmd + - strings.qmd + - regexps.qmd + - factors.qmd + - datetimes.qmd + - missing-values.qmd + - joins.qmd + + - part: import.qmd + chapters: + - spreadsheets.qmd + - databases.qmd + - arrow.qmd + - rectangling.qmd + - webscraping.qmd + + - part: program.qmd + chapters: + - functions.qmd + - iteration.qmd + - base-R.qmd + + - part: communicate.qmd + chapters: + - quarto.qmd + - quarto-formats.qmd + +format: + html: + theme: + - cosmo + - r4ds.scss + code-link: true + + author-meta: "Hadley Wickham, Mine Çetinkaya-Rundel, and Garrett Grolemund" + include-in-header: "plausible.html" + callout-appearance: simple + +editor: visual + diff --git a/arrow.qmd b/arrow.qmd new file mode 100644 index 000000000..36f3e21d3 --- /dev/null +++ b/arrow.qmd @@ -0,0 +1,299 @@ +--- +freeze: true +--- + +# Arrow {#sec-arrow} + +```{r} +#| echo: false + +source("_common.R") +``` + +## Introduction + +CSV files are designed to be easily read by humans. +They're a good interchange format because they're very simple and they can be read by every tool under the sun. +But CSV files aren't very efficient: you have to do quite a lot of work to read the data into R. +In this chapter, you'll learn about a powerful alternative: the [parquet format](https://parquet.apache.org/), an open standards-based format widely used by big data systems. + +We'll pair parquet files with [Apache Arrow](https://arrow.apache.org), a multi-language toolbox designed for efficient analysis and transport of large datasets. +We'll use Apache Arrow via the [arrow package](https://arrow.apache.org/docs/r/), which provides a dplyr backend allowing you to analyze larger-than-memory datasets using familiar dplyr syntax. +As an additional benefit, arrow is extremely fast: you'll see some examples later in the chapter. + +Both arrow and dbplyr provide dplyr backends, so you might wonder when to use each. +In many cases, the choice is made for you, as the data is already in a database or in parquet files, and you'll want to work with it as is. +But if you're starting with your own data (perhaps CSV files), you can either load it into a database or convert it to parquet. +In general, it's hard to know what will work best, so in the early stages of your analysis we'd encourage you to try both and pick the one that works the best for you. 
+
+(A big thanks to Danielle Navarro who contributed the initial version of this chapter.)
+
+### Prerequisites
+
+In this chapter, we'll continue to use the tidyverse, particularly dplyr, but we'll pair it with the arrow package which is designed specifically for working with large data.
+
+```{r setup}
+#| message: false
+#| warning: false
+library(tidyverse)
+library(arrow)
+```
+
+Later in the chapter, we'll also see some connections between arrow and duckdb, so we'll also need dbplyr and duckdb.
+
+```{r}
+library(dbplyr, warn.conflicts = FALSE)
+library(duckdb)
+```
+
+## Getting the data
+
+We begin by getting a dataset worthy of these tools: a dataset of item checkouts from Seattle public libraries, available online at [data.seattle.gov/Community/Checkouts-by-Title/tmmm-ytt6](https://data.seattle.gov/Community/Checkouts-by-Title/tmmm-ytt6).
+This dataset contains 41,389,465 rows that tell you how many times each book was checked out each month from April 2005 to October 2022.
+
+The following code will get you a cached copy of the data.
+The data is a 9GB CSV file, so it will take some time to download.
+I highly recommend using `curl::multi_download()` to get very large files as it's built for exactly this purpose: it gives you a progress bar and it can resume the download if it's interrupted.
+
+```{r}
+#| eval: false
+dir.create("data", showWarnings = FALSE)
+
+curl::multi_download(
+  "https://r4ds.s3.us-west-2.amazonaws.com/seattle-library-checkouts.csv",
+  "data/seattle-library-checkouts.csv",
+  resume = TRUE
+)
+```
+
+## Opening a dataset
+
+Let's start by taking a look at the data.
+At 9GB, this file is large enough that we probably don't want to load the whole thing into memory.
+A good rule of thumb is that you usually want at least twice as much memory as the size of the data, and many laptops top out at 16 GB.
+This means we want to avoid `read_csv()` and instead use `arrow::open_dataset()`:
+
+```{r open-dataset}
+seattle_csv <- open_dataset(
+  sources = "data/seattle-library-checkouts.csv", 
+  col_types = schema(ISBN = string()),
+  format = "csv"
+)
+```
+
+What happens when this code is run?
+`open_dataset()` will scan a few thousand rows to figure out the structure of the dataset.
+The `ISBN` column contains blank values for the first 80,000 rows, so we have to specify the column type to help arrow work out the data structure.
+Once the data has been scanned by `open_dataset()`, it records what it's found and stops; it will only read further rows as you specifically request them.
+This metadata is what we see if we print `seattle_csv`:
+
+```{r}
+seattle_csv
+```
+
+The first line in the output tells you that `seattle_csv` is stored locally on-disk as a single CSV file; it will only be loaded into memory as needed.
+The remainder of the output tells you the column type that arrow has imputed for each column.
+
+We can see what's actually in it with `glimpse()`.
+This reveals that there are \~41 million rows and 12 columns, and shows us a few values.
+
+```{r glimpse-data}
+#| cache: true
+seattle_csv |> glimpse()
+```
+
+We can start to use this dataset with dplyr verbs, using `collect()` to force arrow to perform the computation and return some data.
+For example, this code tells us the total number of checkouts per year:
+
+```{r}
+#| cache: true
+seattle_csv |> 
+  group_by(CheckoutYear) |> 
+  summarise(Checkouts = sum(Checkouts)) |> 
+  arrange(CheckoutYear) |> 
+  collect()
+```
+
+Thanks to arrow, this code will work regardless of how large the underlying dataset is.
+But it's currently rather slow: on Hadley's computer, it took \~10s to run. +That's not terrible given how much data we have, but we can make it much faster by switching to a better format. + +## The parquet format {#sec-parquet} + +To make this data easier to work with, let's switch to the parquet file format and split it up into multiple files. +The following sections will first introduce you to parquet and partitioning, and then apply what we learned to the Seattle library data. + +### Advantages of parquet + +Like CSV, parquet is used for rectangular data, but instead of being a text format that you can read with any file editor, it's a custom binary format designed specifically for the needs of big data. +This means that: + +- Parquet files are usually smaller than the equivalent CSV file. + Parquet relies on [efficient encodings](https://parquet.apache.org/docs/file-format/data-pages/encodings/) to keep file size down, and supports file compression. + This helps make parquet files fast because there's less data to move from disk to memory. + +- Parquet files have a rich type system. + As we talked about in @sec-col-types, a CSV file does not provide any information about column types. + For example, a CSV reader has to guess whether `"08-10-2022"` should be parsed as a string or a date. + In contrast, parquet files store data in a way that records the type along with the data. + +- Parquet files are "column-oriented". + This means that they're organized column-by-column, much like R's data frame. + This typically leads to better performance for data analysis tasks compared to CSV files, which are organized row-by-row. + +- Parquet files are "chunked", which makes it possible to work on different parts of the file at the same time, and, if you're lucky, to skip some chunks altogether. + +There's one primary disadvantage to parquet files: they are no longer "human readable", i.e. if you look at a parquet file using `readr::read_file()`, you'll just see a bunch of gibberish. + +### Partitioning + +As datasets get larger and larger, storing all the data in a single file gets increasingly painful and it's often useful to split large datasets across many files. +When this structuring is done intelligently, this strategy can lead to significant improvements in performance because many analyses will only require a subset of the files. + +There are no hard and fast rules about how to partition your dataset: the results will depend on your data, access patterns, and the systems that read the data. +You're likely to need to do some experimentation before you find the ideal partitioning for your situation. +As a rough guide, arrow suggests that you avoid files smaller than 20MB and larger than 2GB and avoid partitions that produce more than 10,000 files. +You should also try to partition by variables that you filter by; as you'll see shortly, that allows arrow to skip a lot of work by reading only the relevant files. + +### Rewriting the Seattle library data + +Let's apply these ideas to the Seattle library data to see how they play out in practice. +We're going to partition by `CheckoutYear`, since it's likely some analyses will only want to look at recent data and partitioning by year yields 18 chunks of a reasonable size. + +To rewrite the data we define the partition using `dplyr::group_by()` and then save the partitions to a directory with `arrow::write_dataset()`. +`write_dataset()` has two important arguments: a directory where we'll create the files and the format we'll use. 
+
+```{r}
+pq_path <- "data/seattle-library-checkouts"
+```
+
+```{r write-dataset}
+#| eval: !expr "!file.exists(pq_path)"
+
+seattle_csv |>
+  group_by(CheckoutYear) |>
+  write_dataset(path = pq_path, format = "parquet")
+```
+
+This takes about a minute to run; as we'll see shortly this is an initial investment that pays off by making future operations much, much faster.
+
+Let's take a look at what we just produced:
+
+```{r show-parquet-files}
+tibble(
+  files = list.files(pq_path, recursive = TRUE),
+  size_MB = file.size(file.path(pq_path, files)) / 1024^2
+)
+```
+
+Our single 9GB CSV file has been rewritten into 18 parquet files.
+The file names use a "self-describing" convention used by the [Apache Hive](https://hive.apache.org) project.
+Hive-style partitions name folders with a "key=value" convention, so as you might guess, the `CheckoutYear=2005` directory contains all the data where `CheckoutYear` is 2005.
+Each file is between 100 and 300 MB and the total size is now around 4 GB, a little over half the size of the original CSV file.
+This is as we expect since parquet is a much more efficient format.
+
+## Using dplyr with arrow
+
+Now that we've created these parquet files, we'll need to read them in again.
+We use `open_dataset()` again, but this time we give it a directory:
+
+```{r}
+seattle_pq <- open_dataset(pq_path)
+```
+
+Now we can write our dplyr pipeline.
+For example, we could count the total number of books checked out in each month for the last five years:
+
+```{r books-by-year-query}
+query <- seattle_pq |> 
+  filter(CheckoutYear >= 2018, MaterialType == "BOOK") |>
+  group_by(CheckoutYear, CheckoutMonth) |>
+  summarize(TotalCheckouts = sum(Checkouts)) |>
+  arrange(CheckoutYear, CheckoutMonth)
+```
+
+Writing dplyr code for arrow data is conceptually similar to dbplyr (@sec-import-databases): you write dplyr code, which is automatically transformed into a query that the Apache Arrow C++ library understands, which is then executed when you call `collect()`.
+If we print out the `query` object we can see a little information about what we expect Arrow to return when the execution takes place:
+
+```{r}
+query
+```
+
+And we can get the results by calling `collect()`:
+
+```{r books-by-year}
+query |> collect()
+```
+
+Like dbplyr, arrow only understands some R expressions, so you may not be able to write exactly the same code you usually would.
+However, the list of operations and functions supported is fairly extensive and continues to grow; find a complete list of currently supported functions in `?acero`.
+
+### Performance {#sec-parquet-fast}
+
+Let's take a quick look at the performance impact of switching from CSV to parquet.
+First, let's time how long it takes to calculate the number of books checked out in each month of 2021, when the data is stored as a single large CSV:
+
+```{r dataset-performance-csv}
+#| cache: true
+
+seattle_csv |> 
+  filter(CheckoutYear == 2021, MaterialType == "BOOK") |>
+  group_by(CheckoutMonth) |>
+  summarize(TotalCheckouts = sum(Checkouts)) |>
+  arrange(desc(CheckoutMonth)) |>
+  collect() |> 
+  system.time()
+```
+
+Now let's use our new version of the dataset in which the Seattle library checkout data has been partitioned into 18 smaller parquet files:
+
+```{r dataset-performance-multiple-parquet}
+#| cache: true
+
+seattle_pq |> 
+  filter(CheckoutYear == 2021, MaterialType == "BOOK") |>
+  group_by(CheckoutMonth) |>
+  summarize(TotalCheckouts = sum(Checkouts)) |>
+  arrange(desc(CheckoutMonth)) |>
+  collect() |> 
+  system.time()
+```
+
+The \~100x speedup in performance is attributable to two factors: the multi-file partitioning, and the format of individual files:
+
+-   Partitioning improves performance because this query uses `CheckoutYear == 2021` to filter the data, and arrow is smart enough to recognize that it only needs to read 1 of the 18 parquet files.
+-   The parquet format improves performance by storing data in a binary format that can be read more directly into memory. The column-wise format and rich metadata means that arrow only needs to read the four columns actually used in the query (`CheckoutYear`, `MaterialType`, `CheckoutMonth`, and `Checkouts`).
+
+This massive difference in performance is why it pays off to convert large CSVs to parquet!
+
+### Using duckdb with arrow
+
+There's one last advantage of parquet and arrow --- it's very easy to turn an arrow dataset into a DuckDB database (@sec-import-databases) by calling `arrow::to_duckdb()`:
+
+```{r use-duckdb}
+seattle_pq |> 
+  to_duckdb() |>
+  filter(CheckoutYear >= 2018, MaterialType == "BOOK") |>
+  group_by(CheckoutYear) |>
+  summarize(TotalCheckouts = sum(Checkouts)) |>
+  arrange(desc(CheckoutYear)) |>
+  collect()
+```
+
+The neat thing about `to_duckdb()` is that the transfer doesn't involve any memory copying, and speaks to the goals of the arrow ecosystem: enabling seamless transitions from one computing environment to another.
+
+### Exercises
+
+1.  Figure out the most popular book each year.
+2.  Which author has the most books in the Seattle library system?
+3.  How have checkouts of books vs. ebooks changed over the last 10 years?
+
+## Summary
+
+In this chapter, you've been given a taste of the arrow package, which provides a dplyr backend for working with large on-disk datasets.
+It can work with CSV files, and it's much, much faster if you convert your data to parquet.
+Parquet is a binary data format that's designed specifically for data analysis on modern computers.
+Far fewer tools can work with parquet files compared to CSV, but its partitioned, compressed, and columnar structure makes it much more efficient to analyze.
+
+Next up you'll learn about your first non-rectangular data source, which you'll handle using tools provided by the tidyr package.
+We'll focus on data that comes from JSON files, but the general principles apply to tree-like data regardless of its source.
diff --git a/base-R.qmd b/base-R.qmd new file mode 100644 index 000000000..b3de68cce --- /dev/null +++ b/base-R.qmd @@ -0,0 +1,546 @@ +# A field guide to base R {#sec-base-r}

```{r}
#| echo: false

source("_common.R")
```

## Introduction

To finish off the programming section, we're going to give you a quick tour of the most important base R functions that we don't otherwise discuss in the book.
These tools are particularly useful as you do more programming and will help you read code you'll encounter in the wild.

This is a good place to remind you that the tidyverse is not the only way to solve data science problems.
We teach the tidyverse in this book because tidyverse packages share a common design philosophy, increasing the consistency across functions, and making each new function or package a little easier to learn and use.
It's not possible to use the tidyverse without using base R, so we've actually already taught you a **lot** of base R functions: from `library()` to load packages, to `sum()` and `mean()` for numeric summaries, to the factor, date, and POSIXct data types, and of course all the basic operators like `+`, `-`, `/`, `*`, `|`, `&`, and `!`.
What we haven't focused on so far is base R workflows, so we will highlight a few of those in this chapter.

After you finish this book, you'll likely learn other approaches to the same problems using base R, data.table, and other packages.
You'll undoubtedly encounter these other approaches when you start reading R code written by others, particularly if you're using Stack Overflow.
It's 100% okay to write code that uses a mix of approaches, so don't let anyone tell you otherwise!

In this chapter, we'll focus on four big topics: subsetting with `[`, subsetting with `[[` and `$`, the apply family of functions, and `for` loops.
To finish off, we'll briefly discuss two essential plotting functions.

### Prerequisites

This chapter focuses on base R, so it doesn't have any real prerequisites, but we'll load the tidyverse in order to explain some of the differences.

```{r}
#| label: setup
#| message: false

library(tidyverse)
```

## Selecting multiple elements with `[` {#sec-subset-many}

`[` is used to extract sub-components from vectors and data frames, and is called like `x[i]` or `x[i, j]`.
In this section, we'll introduce you to the power of `[`, first showing you how you can use it with vectors, then how the same principles extend in a straightforward way to two-dimensional (2d) structures like data frames.
We'll then help you cement that knowledge by showing how various dplyr verbs are special cases of `[`.

### Subsetting vectors

There are five main types of things that you can subset a vector with, i.e., that can be the `i` in `x[i]`:

1. **A vector of positive integers**.
    Subsetting with positive integers keeps the elements at those positions:

    ```{r}
    x <- c("one", "two", "three", "four", "five")
    x[c(3, 2, 5)]
    ```

    By repeating a position, you can actually make a longer output than input, making the term "subsetting" a bit of a misnomer.

    ```{r}
    x[c(1, 1, 5, 5, 5, 2)]
    ```

2. **A vector of negative integers**.
    Negative values drop the elements at the specified positions:

    ```{r}
    x[c(-1, -3, -5)]
    ```

3. **A logical vector**.
    Subsetting with a logical vector keeps all values corresponding to a `TRUE` value.
    This is most often useful in conjunction with the comparison functions.
+

    ```{r}
    x <- c(10, 3, NA, 5, 8, 1, NA)

    # All non-missing values of x
    x[!is.na(x)]

    # All even (or missing!) values of x
    x[x %% 2 == 0]
    ```

    Unlike `filter()`, `NA` indices will be included in the output as `NA`s.

4. **A character vector**.
    If you have a named vector, you can subset it with a character vector:

    ```{r}
    x <- c(abc = 1, def = 2, xyz = 5)
    x[c("xyz", "def")]
    ```

    As with subsetting with positive integers, you can use a character vector to duplicate individual entries.

5. **Nothing**.
    The final type of subsetting is nothing, `x[]`, which returns the complete `x`.
    This is not useful for subsetting vectors, but as we'll see shortly, it is useful when subsetting 2d structures like tibbles.

### Subsetting data frames

There are quite a few different ways[^base-r-1] that you can use `[` with a data frame, but the most important way is to select rows and columns independently with `df[rows, cols]`. Here `rows` and `cols` are vectors as described above.
For example, `df[rows, ]` and `df[, cols]` select just rows or just columns, using the empty subset to preserve the other dimension.

[^base-r-1]: Read the subsetting chapter of *Advanced R* (<https://adv-r.hadley.nz/subsetting.html>) to see how you can also subset a data frame like it is a 1d object and how you can subset it with a matrix.

Here are a couple of examples:

```{r}
df <- tibble(
  x = 1:3,
  y = c("a", "e", "f"),
  z = runif(3)
)

# Select first row and second column
df[1, 2]

# Select all rows and columns x and y
df[, c("x", "y")]

# Select rows where `x` is greater than 1 and all columns
df[df$x > 1, ]
```

We'll come back to `$` shortly, but you should be able to guess what `df$x` does from the context: it extracts the `x` variable from `df`.
We need to use it here because `[` doesn't use tidy evaluation, so you need to be explicit about the source of the `x` variable.

There's an important difference between tibbles and data frames when it comes to `[`.
In this book, we've mainly used tibbles, which *are* data frames, but they tweak some behaviors to make your life a little easier.
In most places, you can use "tibble" and "data frame" interchangeably, so when we want to draw particular attention to R's built-in data frame, we'll write `data.frame`.
If `df` is a `data.frame`, then `df[, cols]` will return a vector if `cols` selects a single column and a data frame if it selects more than one column.
If `df` is a tibble, then `[` will always return a tibble.

```{r}
df1 <- data.frame(x = 1:3)
df1[, "x"]

df2 <- tibble(x = 1:3)
df2[, "x"]
```

One way to avoid this ambiguity with `data.frame`s is to explicitly specify `drop = FALSE`:

```{r}
df1[, "x", drop = FALSE]
```

### dplyr equivalents

Several dplyr verbs are special cases of `[`:

- `filter()` is equivalent to subsetting the rows with a logical vector, taking care to exclude missing values:

    ```{r}
    #| results: false

    df <- tibble(
      x = c(2, 3, 1, 1, NA),
      y = letters[1:5],
      z = runif(5)
    )
    df |> filter(x > 1)

    # same as
    df[!is.na(df$x) & df$x > 1, ]
    ```

    Another common technique in the wild is to use `which()` for its side-effect of dropping missing values: `df[which(df$x > 1), ]`.
+

- `arrange()` is equivalent to subsetting the rows with an integer vector, usually created with `order()`:

    ```{r}
    #| results: false

    df |> arrange(x, y)

    # same as
    df[order(df$x, df$y), ]
    ```

    You can use `order(decreasing = TRUE)` to sort all columns in descending order, or `-rank(col)` to sort an individual column in descending order.

- Both `select()` and `relocate()` are similar to subsetting the columns with a character vector:

    ```{r}
    #| results: false

    df |> select(x, z)

    # same as
    df[, c("x", "z")]
    ```

Base R also provides a function that combines the features of `filter()` and `select()`[^base-r-2] called `subset()`:

[^base-r-2]: But it doesn't handle grouped data frames differently and it doesn't support selection helper functions like `starts_with()`.

```{r}
df |>
  filter(x > 1) |>
  select(y, z)
```

```{r}
#| results: false

# same as
df |> subset(x > 1, c(y, z))
```

This function was the inspiration for much of dplyr's syntax.

### Exercises

1. Create functions that take a vector as input and return:

    a. The elements at even-numbered positions.
    b. Every element except the last value.
    c. Only even values (and no missing values).

2. Why is `x[-which(x > 0)]` not the same as `x[x <= 0]`?
    Read the documentation for `which()` and do some experiments to figure it out.

## Selecting a single element with `$` and `[[` {#sec-subset-one}

`[`, which selects many elements, is paired with `[[` and `$`, which extract a single element.
In this section, we'll show you how to use `[[` and `$` to pull columns out of data frames, discuss a couple more differences between `data.frame`s and tibbles, and emphasize some important differences between `[` and `[[` when used with lists.

### Data frames

`[[` and `$` can be used to extract columns out of a data frame.
`[[` can access by position or by name, and `$` is specialized for access by name:

```{r}
tb <- tibble(
  x = 1:4,
  y = c(10, 4, 1, 21)
)

# by position
tb[[1]]

# by name
tb[["x"]]
tb$x
```

They can also be used to create new columns, the base R equivalent of `mutate()`:

```{r}
tb$z <- tb$x + tb$y
tb
```

There are several other base R approaches to creating new columns, including `transform()`, `with()`, and `within()`.
Hadley has collected a few examples of these approaches online.

Using `$` directly is convenient when performing quick summaries.
For example, if you just want to find the size of the biggest diamond or the possible values of `cut`, there's no need to use `summarize()`:

```{r}
max(diamonds$carat)

levels(diamonds$cut)
```

dplyr also provides an equivalent to `[[`/`$` that we didn't mention in @sec-data-transform: `pull()`.
`pull()` takes either a variable name or variable position and returns just that column.
That means we could rewrite the above code to use the pipe:

```{r}
diamonds |> pull(carat) |> max()

diamonds |> pull(cut) |> levels()
```

### Tibbles

There are a couple of important differences between tibbles and base `data.frame`s when it comes to `$`.
+Data frames match the prefix of any variable names (so-called **partial matching**) and don't complain if a column doesn't exist:

```{r}
df <- data.frame(x1 = 1)
df$x
df$z
```

Tibbles are stricter: they only ever match variable names exactly and they will generate a warning if the column you are trying to access doesn't exist:

```{r}
tb <- tibble(x1 = 1)

tb$x
tb$z
```

For this reason, we sometimes joke that tibbles are lazy and surly: they do less and complain more.

### Lists

`[[` and `$` are also really important for working with lists, so it's worth understanding how they differ from `[`.
Let's illustrate the differences with a list named `l`:

```{r}
l <- list(
  a = 1:3,
  b = "a string",
  c = pi,
  d = list(-1, -5)
)
```

- `[` extracts a sub-list.
    No matter how many elements you extract, the result will always be a list.

    ```{r}
    str(l[1:2])

    str(l[1])

    str(l[4])
    ```

    Like with vectors, you can subset with a logical, integer, or character vector.

- `[[` and `$` extract a single component from a list.
    They remove a level of hierarchy from the list.

    ```{r}
    str(l[[1]])

    str(l[[4]])

    str(l$a)
    ```

The difference between `[` and `[[` is particularly important for lists because `[[` drills down into the list while `[` returns a new, smaller list.
To help you remember the difference, take a look at the unusual pepper shaker shown in @fig-pepper.
If this pepper shaker is your list `pepper`, then `pepper[1]` is a pepper shaker containing a single pepper packet.
`pepper[2]` would look the same, but would contain the second packet.
`pepper[1:2]` would be a pepper shaker containing two pepper packets.
`pepper[[1]]` would extract the pepper packet itself.

```{r}
#| label: fig-pepper
#| echo: false
#| out-width: "100%"
#| fig-cap: |
#|   (Left) A pepper shaker that Hadley once found in his hotel room.
#|   (Middle) `pepper[1]`.
#|   (Right) `pepper[[1]]`.
#| fig-alt: |
#|   Three photos. On the left is a photo of a glass pepper shaker. Instead of
#|   the pepper shaker containing pepper, it contains a single packet of pepper.
#|   In the middle is a photo of a single packet of pepper. On the right is a
#|   photo of the contents of a packet of pepper.

knitr::include_graphics("diagrams/pepper.png")
```

This same principle applies when you use 1d `[` with a data frame: `df["x"]` returns a one-column data frame and `df[["x"]]` returns a vector.

### Exercises

1. What happens when you use `[[` with a positive integer that's bigger than the length of the vector?
    What happens when you subset with a name that doesn't exist?

2. What would `pepper[[1]][1]` be?
    What about `pepper[[1]][[1]]`?

## Apply family

In @sec-iteration, you learned tidyverse techniques for iteration like `dplyr::across()` and the map family of functions.
In this section, you'll learn about their base equivalents, the **apply family**.
In this context, apply and map are synonyms, because another way of saying "map a function over each element of a vector" is "apply a function over each element of a vector".
Here we'll give you a quick overview of this family so you can recognize them in the wild.

The most important member of this family is `lapply()`, which is very similar to `purrr::map()`[^base-r-3].
In fact, because we haven't used any of `map()`'s more advanced features, you can replace every `map()` call in @sec-iteration with `lapply()`.
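For example, the two calls below return the same list of means, one per element; the only difference is which package provides the function.
(This is a minimal sketch on a made-up list, not an example from @sec-iteration.)

```{r}
x <- list(a = 1:5, b = 10:20)

# purrr
map(x, mean)

# base R equivalent
lapply(x, mean)
```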
+

[^base-r-3]: It just lacks convenient features like progress bars and reporting which element caused the problem if there's an error.

There's no exact base R equivalent to `across()`, but you can get close by using `[` with `lapply()`.
This works because under the hood, data frames are lists of columns, so calling `lapply()` on a data frame applies the function to each column.

```{r}
df <- tibble(a = 1, b = 2, c = "a", d = "b", e = 4)

# First find numeric columns
num_cols <- sapply(df, is.numeric)
num_cols

# Then transform those columns with lapply() and replace the original values
df[, num_cols] <- lapply(df[, num_cols, drop = FALSE], \(x) x * 2)
df
```

The code above uses a new function, `sapply()`.
It's similar to `lapply()` but it always tries to simplify the result, hence the `s` in its name, here producing a logical vector instead of a list.
We don't recommend using it for programming, because the simplification can fail and give you an unexpected type, but it's usually fine for interactive use.
purrr has a similar function called `map_vec()` that we didn't mention in @sec-iteration.

Base R provides a stricter version of `sapply()` called `vapply()`, short for **v**ector apply.
It takes an additional argument that specifies the expected type, ensuring that simplification occurs the same way regardless of the input.
For example, we could replace the `sapply()` call above with this `vapply()` where we specify that we expect `is.numeric()` to return a logical vector of length 1:

```{r}
vapply(df, is.numeric, logical(1))
```

The distinction between `sapply()` and `vapply()` is really important when they're inside a function (because it makes a big difference to the function's robustness to unusual inputs), but it doesn't usually matter in data analysis.

Another important member of the apply family is `tapply()`, which computes a single grouped summary:

```{r}
diamonds |>
  group_by(cut) |>
  summarize(price = mean(price))

tapply(diamonds$price, diamonds$cut, mean)
```

Unfortunately, `tapply()` returns its results in a named vector, which requires some gymnastics if you want to collect multiple summaries and grouping variables into a data frame (it's certainly possible to avoid this and just work with free-floating vectors, but in our experience that just delays the work).
If you want to see how you might use `tapply()` or other base techniques to perform other grouped summaries, Hadley has collected a few techniques [in a gist](https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec).

The final member of the apply family is the titular `apply()`, which works with matrices and arrays.
In particular, watch out for `apply(df, 2, something)`, which is a slow and potentially dangerous way of doing `lapply(df, something)`.
This rarely comes up in data science because we usually work with data frames and not matrices.

## `for` loops

`for` loops are the fundamental building block of iteration that both the apply and map families use under the hood.
`for` loops are powerful and general tools that are important to learn as you become a more experienced R programmer.
The basic structure of a `for` loop looks like this:

```{r}
#| eval: false

for (element in vector) {
  # do something with element
}
```

The most straightforward use of `for` loops is to achieve the same effect as `walk()`: call some function with a side-effect on each element of a list.
+For example, in @sec-save-database instead of using `walk()`:

```{r}
#| eval: false

paths |> walk(append_file)
```

We could have used a `for` loop:

```{r}
#| eval: false

for (path in paths) {
  append_file(path)
}
```

Things get a little trickier if you want to save the output of the `for` loop, for example reading all of the Excel files in a directory like we did in @sec-iteration:

```{r}
paths <- dir("data/gapminder", pattern = "\\.xlsx$", full.names = TRUE)
files <- map(paths, readxl::read_excel)
```

There are a few different techniques that you can use, but we recommend being explicit about what the output is going to look like upfront.
In this case, we're going to want a list the same length as `paths`, which we can create with `vector()`:

```{r}
files <- vector("list", length(paths))
```

Then instead of iterating over the elements of `paths`, we'll iterate over their indices, using `seq_along()` to generate one index for each element of `paths`:

```{r}
seq_along(paths)
```

Using the indices is important because it allows us to link each position in the input with the corresponding position in the output:

```{r}
for (i in seq_along(paths)) {
  files[[i]] <- readxl::read_excel(paths[[i]])
}
```

To combine the list of tibbles into a single tibble, you can use `do.call()` + `rbind()`:

```{r}
do.call(rbind, files)
```

Rather than making a list and saving the results as we go, a simpler approach is to build up the data frame piece-by-piece:

```{r}
out <- NULL
for (path in paths) {
  out <- rbind(out, readxl::read_excel(path))
}
```

We recommend avoiding this pattern because it can become very slow when the vector is very long.
This is the source of the persistent canard that `for` loops are slow: they're not, but iteratively growing a vector is.

## Plots

Many R users who don't otherwise use the tidyverse prefer ggplot2 for plotting due to helpful features like sensible defaults, automatic legends, and a modern look.
However, base R plotting functions can still be useful because they're so concise --- it takes very little typing to do a basic exploratory plot.

There are two main types of base plot you'll see in the wild: scatterplots and histograms, produced with `plot()` and `hist()` respectively.
Here's a quick example from the diamonds dataset:

```{r}
#| dev: png
#| fig-width: 4
#| fig-asp: 1
#| layout-ncol: 2
#| fig-alt: |
#|   On the left, histogram of carats of diamonds, ranging from 0 to 5 carats.
#|   The distribution is unimodal and right-skewed. On the right, scatter
#|   plot of price vs. carat of diamonds, showing a positive relationship
#|   that fans out as both price and carat increase. The scatter plot
#|   shows very few diamonds bigger than 3 carats compared to diamonds between
#|   0 and 3 carats.

# Left
hist(diamonds$carat)

# Right
plot(diamonds$carat, diamonds$price)
```

Note that base plotting functions work with vectors, so you need to pull columns out of the data frame using `$` or some other technique.

## Summary

In this chapter, we've shown you a selection of base R functions useful for subsetting and iteration.
Compared to approaches discussed elsewhere in the book, these functions tend to have more of a "vector" flavor than a "data frame" flavor because base R functions tend to take individual vectors, rather than a data frame and some column specification.
+This often makes life easier for programming and so becomes more important as you write more functions and begin to write your own packages. + +This chapter concludes the programming section of the book. +You've made a solid start on your journey to becoming not just a data scientist who uses R, but a data scientist who can *program* in R. +We hope these chapters have sparked your interest in programming and that you're looking forward to learning more outside of this book. diff --git a/communicate-plots.Rmd b/communicate-plots.Rmd deleted file mode 100644 index 8cd678471..000000000 --- a/communicate-plots.Rmd +++ /dev/null @@ -1,594 +0,0 @@ -# Graphics for communication - -## Introduction - -In [exploratory data analysis], you learned how to use plots as tools for _exploration_. When you make exploratory plots, you know---even before looking---which variables the plot will display. You made each plot for a purpose, could quickly look at it, and then move on to the next plot. In the course of most analyses, you'll produce tens or hundreds of plots, most of which are immediately thrown away. - -Now that you understand your data, you need to _communicate_ your understanding to others. Your audience will likely not share your background knowledge and will not be deeply invested in the data. To help others quickly build up a good mental model of the data, you will need to invest considerable effort in making your plots as self-explanatory as possible. In this chapter, you'll learn some of the tools that ggplot2 provides to do so. - -This chapter focuses on the tools you need to create good graphics. I assume that you know what you want, and just need to know how to do it. For that reason, I highly recommend pairing this chapter with a good general visualisation book. I particularly like [_The Truthful Art_](https://amzn.com/0321934075), by Albert Cairo. It doesn't teach the mechanics of creating visualisations, but instead focuses on what you need to think about in order to create effective graphics. - -### Prerequisites - -In this chapter, we'll focus once again on ggplot2. We'll also use a little dplyr for data manipulation, and a few ggplot2 extension packages, including __ggrepel__ and __viridis__. Rather than loading those extensions here, we'll refer to their functions explicitly, using the `::` notation. This will help make it clear which functions are built into ggplot2, and which come from other packages. Don't forget you'll need to install those packages with `install.packages()` if you don't already have them. - -```{r, message = FALSE} -library(tidyverse) -``` - -## Label - -The easiest place to start when turning an exploratory graphic into an expository graphic is with good labels. You add labels with the `labs()` function. This example adds a plot title: - -```{r, message = FALSE} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = class)) + - geom_smooth(se = FALSE) + - labs(title = "Fuel efficiency generally decreases with engine size") -``` - -The purpose of a plot title is to summarise the main finding. Avoid titles that just describe what the plot is, e.g. "A scatterplot of engine displacement vs. fuel economy". - -If you need to add more text, there are two other useful labels that you can use in ggplot2 2.2.0 and above (which should be available by the time you're reading this book): - -* `subtitle` adds additional detail in a smaller font beneath the title. - -* `caption` adds text at the bottom right of the plot, often used to describe - the source of the data. 
- -```{r, message = FALSE} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = class)) + - geom_smooth(se = FALSE) + - labs( - title = "Fuel efficiency generally decreases with engine size", - subtitle = "Two seaters (sports cars) are an exception because of their light weight", - caption = "Data from fueleconomy.gov" - ) -``` - -You can also use `labs()` to replace the axis and legend titles. It's usually a good idea to replace short variable names with more detailed descriptions, and to include the units. - -```{r, message = FALSE} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - geom_smooth(se = FALSE) + - labs( - x = "Engine displacement (L)", - y = "Highway fuel economy (mpg)", - colour = "Car type" - ) -``` - -It's possible to use mathematical equations instead of text strings. Just switch `""` out for `quote()` and read about the available options in `?plotmath`: - -```{r, fig.asp = 1, out.width = "50%", fig.width = 3} -df <- tibble( - x = runif(10), - y = runif(10) -) -ggplot(df, aes(x, y)) + - geom_point() + - labs( - x = quote(sum(x[i] ^ 2, i == 1, n)), - y = quote(alpha + beta + frac(delta, theta)) - ) -``` - -### Exercises - -1. Create one plot on the fuel economy data with customised `title`, - `subtitle`, `caption`, `x`, `y`, and `colour` labels. - -1. The `geom_smooth()` is somewhat misleading because the `hwy` for - large engines is skewed upwards due to the inclusion of lightweight - sports cars with big engines. Use your modelling tools to fit and display - a better model. - -1. Take an exploratory graphic that you've created in the last month, and add - informative titles to make it easier for others to understand. - -## Annotations - -In addition to labelling major components of your plot, it's often useful to label individual observations or groups of observations. The first tool you have at your disposal is `geom_text()`. `geom_text()` is similar to `geom_point()`, but it has an additional aesthetic: `label`. This makes it possible to add textual labels to your plots. - -There are two possible sources of labels. First, you might have a tibble that provides labels. The plot below isn't terribly useful, but it illustrates a useful approach: pull out the most efficient car in each class with dplyr, and then label it on the plot: - -```{r} -best_in_class <- mpg %>% - group_by(class) %>% - filter(row_number(desc(hwy)) == 1) - -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - geom_text(aes(label = model), data = best_in_class) -``` - -This is hard to read because the labels overlap with each other, and with the points. We can make things a little better by switching to `geom_label()` which draws a rectangle behind the text. We also use the `nudge_y` parameter to move the labels slightly above the corresponding points: - -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - geom_label(aes(label = model), data = best_in_class, nudge_y = 2, alpha = 0.5) -``` - -That helps a bit, but if you look closely in the top-left hand corner, you'll notice that there are two labels practically on top of each other. This happens because the highway mileage and displacement for the best cars in the compact and subcompact categories are exactly the same. There's no way that we can fix these by applying the same transformation for every label. Instead, we can use the __ggrepel__ package by Kamil Slowikowski. 
This useful package will automatically adjust labels so that they don't overlap: - -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - geom_point(size = 3, shape = 1, data = best_in_class) + - ggrepel::geom_label_repel(aes(label = model), data = best_in_class) -``` - -Note another handy technique used here: I added a second layer of large, hollow points to highlight the points that I've labelled. - -You can sometimes use the same idea to replace the legend with labels placed directly on the plot. It's not wonderful for this plot, but it isn't too bad. (`theme(legend.position = "none"`) turns the legend off --- we'll talk about it more shortly.) - -```{r} -class_avg <- mpg %>% - group_by(class) %>% - summarise( - displ = median(displ), - hwy = median(hwy) - ) - -ggplot(mpg, aes(displ, hwy, colour = class)) + - ggrepel::geom_label_repel(aes(label = class), - data = class_avg, - size = 6, - label.size = 0, - segment.color = NA - ) + - geom_point() + - theme(legend.position = "none") -``` - -Alternatively, you might just want to add a single label to the plot, but you'll still need to create a data frame. Often, you want the label in the corner of the plot, so it's convenient to create a new data frame using `summarise()` to compute the maximum values of x and y. - -```{r} -label <- mpg %>% - summarise( - displ = max(displ), - hwy = max(hwy), - label = "Increasing engine size is \nrelated to decreasing fuel economy." - ) - -ggplot(mpg, aes(displ, hwy)) + - geom_point() + - geom_text(aes(label = label), data = label, vjust = "top", hjust = "right") -``` - -If you want to place the text exactly on the borders of the plot, you can use `+Inf` and `-Inf`. Since we're no longer computing the positions from `mpg`, we can use `tibble()` to create the data frame: - -```{r} -label <- tibble( - displ = Inf, - hwy = Inf, - label = "Increasing engine size is \nrelated to decreasing fuel economy." -) - -ggplot(mpg, aes(displ, hwy)) + - geom_point() + - geom_text(aes(label = label), data = label, vjust = "top", hjust = "right") -``` - -In these examples, I manually broke the label up into lines using `"\n"`. Another approach is to use `stringr::str_wrap()` to automatically add line breaks, given the number of characters you want per line: - -```{r} -"Increasing engine size is related to decreasing fuel economy." %>% - stringr::str_wrap(width = 40) %>% - writeLines() -``` - -Note the use of `hjust` and `vjust` to control the alignment of the label. Figure \@ref(fig:just) shows all nine possible combinations. - -```{r just, echo = FALSE, fig.cap = "All nine combinations of `hjust` and `vjust`.", fig.asp = 0.5, fig.width = 4.5, out.width = "60%"} -vjust <- c(bottom = 0, center = 0.5, top = 1) -hjust <- c(left = 0, center = 0.5, right = 1) - -df <- tidyr::crossing(hj = names(hjust), vj = names(vjust)) %>% - mutate( - y = vjust[vj], - x = hjust[hj], - label = paste0("hjust = '", hj, "'\n", "vjust = '", vj, "'") - ) - -ggplot(df, aes(x, y)) + - geom_point(colour = "grey70", size = 5) + - geom_point(size = 0.5, colour = "red") + - geom_text(aes(label = label, hjust = hj, vjust = vj), size = 4) + - labs(x = NULL, y = NULL) -``` - -Remember, in addition to `geom_text()`, you have many other geoms in ggplot2 available to help annotate your plot. A few ideas: - -* Use `geom_hline()` and `geom_vline()` to add reference lines. I often make - them thick (`size = 2`) and white (`colour = white`), and draw them - underneath the primary data layer. 
That makes them easy to see, without - drawing attention away from the data. - -* Use `geom_rect()` to draw a rectangle around points of interest. The - boundaries of the rectangle are defined by aesthetics `xmin`, `xmax`, - `ymin`, `ymax`. - -* Use `geom_segment()` with the `arrow` argument to draw attention - to a point with an arrow. Use aesthetics `x` and `y` to define the - starting location, and `xend` and `yend` to define the end location. - -The only limit is your imagination (and your patience with positioning annotations to be aesthetically pleasing)! - -### Exercises - -1. Use `geom_text()` with infinite positions to place text at the - four corners of the plot. - -1. Read the documentation for `annotate()`. How can you use it to add a text - label to a plot without having to create a tibble? - -1. How do labels with `geom_text()` interact with faceting? How can you - add a label to a single facet? How can you put a different label in - each facet? (Hint: think about the underlying data.) - -1. What arguments to `geom_label()` control the appearance of the background - box? - -1. What are the four arguments to `arrow()`? How do they work? Create a series - of plots that demonstrate the most important options. - -## Scales - -The third way you can make your plot better for communication is to adjust the scales. Scales control the mapping from data values to things that you can perceive. Normally, ggplot2 automatically adds scales for you. For example, when you type: - -```{r default-scales, fig.show = "hide"} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) -``` - -ggplot2 automatically adds default scales behind the scenes: - -```{r, fig.show = "hide"} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - scale_x_continuous() + - scale_y_continuous() + - scale_colour_discrete() -``` - -Note the naming scheme for scales: `scale_` followed by the name of the aesthetic, then `_`, then the name of the scale. The default scales are named according to the type of variable they align with: continuous, discrete, datetime, or date. There are lots of non-default scales which you'll learn about below. - -The default scales have been carefully chosen to do a good job for a wide range of inputs. Nevertheless, you might want to override the defaults for two reasons: - -* You might want to tweak some of the parameters of the default scale. - This allows you to do things like change the breaks on the axes, or the - key labels on the legend. - -* You might want to replace the scale altogether, and use a completely - different algorithm. Often you can do better than the default because - you know more about the data. - -### Axis ticks and legend keys - -There are two primary arguments that affect the appearance of the ticks on the axes and the keys on the legend: `breaks` and `labels`. Breaks controls the position of the ticks, or the values associated with the keys. Labels controls the text label associated with each tick/key. The most common use of `breaks` is to override the default choice: - -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point() + - scale_y_continuous(breaks = seq(15, 40, by = 5)) -``` - -You can use `labels` in the same way (a character vector the same length as `breaks`), but you can also set it to `NULL` to suppress the labels altogether. This is useful for maps, or for publishing plots where you can't share the absolute numbers. 
- -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point() + - scale_x_continuous(labels = NULL) + - scale_y_continuous(labels = NULL) -``` - -You can also use `breaks` and `labels` to control the appearance of legends. Collectively axes and legends are called __guides__. Axes are used for x and y aesthetics; legends are used for everything else. - -Another use of `breaks` is when you have relatively few data points and want to highlight exactly where the observations occur. For example, take this plot that shows when each US president started and ended their term. - -```{r} -presidential %>% - mutate(id = 33 + row_number()) %>% - ggplot(aes(start, id)) + - geom_point() + - geom_segment(aes(xend = end, yend = id)) + - scale_x_date(NULL, breaks = presidential$start, date_labels = "'%y") -``` - -Note that the specification of breaks and labels for date and datetime scales is a little different: - -* `date_labels` takes a format specification, in the same form as - `parse_datetime()`. - -* `date_breaks` (not shown here), takes a string like "2 days" or "1 month". - -### Legend layout - -You will most often use `breaks` and `labels` to tweak the axes. While they both also work for legends, there are a few other techniques you are more likely to use. - -To control the overall position of the legend, you need to use a `theme()` setting. We'll come back to themes at the end of the chapter, but in brief, they control the non-data parts of the plot. The theme setting `legend.position` controls where the legend is drawn: - -```{r fig.asp = 1, fig.align = "default", out.width = "50%", fig.width = 4} -base <- ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) - -base + theme(legend.position = "left") -base + theme(legend.position = "top") -base + theme(legend.position = "bottom") -base + theme(legend.position = "right") # the default -``` - -You can also use `legend.position = "none"` to suppress the display of the legend altogether. - -To control the display of individual legends, use `guides()` along with `guide_legend()` or `guide_colourbar()`. The following example shows two important settings: controlling the number of rows the legend uses with `nrow`, and overriding one of the aesthetics to make the points bigger. This is particularly useful if you have used a low `alpha` to display many points on a plot. - -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - geom_smooth(se = FALSE) + - theme(legend.position = "bottom") + - guides(colour = guide_legend(nrow = 1, override.aes = list(size = 4))) -``` - -### Replacing a scale - -Instead of just tweaking the details a little, you can instead replace the scale altogether. There are two types of scales you're mostly likely to want to switch out: continuous position scales and colour scales. Fortunately, the same principles apply to all the other aesthetics, so once you've mastered position and colour, you'll be able to quickly pick up other scale replacements. - -It's very useful to plot transformations of your variable. For example, as we've seen in [diamond prices](diamond-prices) it's easier to see the precise relationship between `carat` and `price` if we log transform them: - -```{r, fig.align = "default", out.width = "50%"} -ggplot(diamonds, aes(carat, price)) + - geom_bin2d() - -ggplot(diamonds, aes(log10(carat), log10(price))) + - geom_bin2d() -``` - -However, the disadvantage of this transformation is that the axes are now labelled with the transformed values, making it hard to interpret the plot. 
Instead of doing the transformation in the aesthetic mapping, we can instead do it with the scale. This is visually identical, except the axes are labelled on the original data scale. - -```{r} -ggplot(diamonds, aes(carat, price)) + - geom_bin2d() + - scale_x_log10() + - scale_y_log10() -``` - -Another scale that is frequently customised is colour.The default categorical scale picks colours that are evenly spaced around the colour wheel. Useful alternatives are the ColorBrewer scales which have been hand tuned to work better for people with common types of colour blindness. The two plots below look similar, but there is enough difference in the shades of red and green that the dots on the right can be distinguished even by people with red-green colour blindness. - -```{r, fig.align = "default", out.width = "50%"} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = drv)) - -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = drv)) + - scale_colour_brewer(palette = "Set1") -``` - -Don't forget simpler techniques. If there are just a few colours, you can add a redundant shape mapping. This will also help ensure your plot is interpretable in black and white. - -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = drv, shape = drv)) + - scale_colour_brewer(palette = "Set1") -``` - -The ColorBrewer scales are documented online at and made available in R via the __RColorBrewer__ package, by Erich Neuwirth. Figure \@ref(fig:brewer) shows the complete list of all palettes. The sequential (top) and diverging (bottom) palettes are particularly useful if your categorical values are ordered, or have a "middle". This often arises if you've used `cut()` to make a continuous variable into a categorical variable. - -```{r brewer, fig.asp = 2.5, echo = FALSE, fig.cap = "All ColourBrewer scales."} -par(mar = c(0, 3, 0, 0)) -RColorBrewer::display.brewer.all() -``` - -When you have a predefined mapping between values and colours, use `scale_colour_manual()`. For example, if we map presidential party to colour, we want to use the standard mapping of red for Republicans and blue for Democrats: - -```{r} -presidential %>% - mutate(id = 33 + row_number()) %>% - ggplot(aes(start, id, colour = party)) + - geom_point() + - geom_segment(aes(xend = end, yend = id)) + - scale_colour_manual(values = c(Republican = "red", Democratic = "blue")) -``` - -For continuous colour, you can use the built-in `scale_colour_gradient()` or `scale_fill_gradient()`. If you have a diverging scale, you can use `scale_colour_gradient2()`. That allows you to give, for example, positive and negative values different colours. That's sometimes also useful if you want to distinguish points above or below the mean. - -Another option is `scale_colour_viridis()` provided by the __viridis__ package. It's a continuous analog of the categorical ColorBrewer scales. The designers, Nathaniel Smith and Stéfan van der Walt, carefully tailored a continuous colour scheme that has good perceptual properties. Here's an example from the viridis vignette. 
- -```{r, fig.align = "default", fig.asp = 1, out.width = "50%", fig.width = 4} -df <- tibble( - x = rnorm(10000), - y = rnorm(10000) -) -ggplot(df, aes(x, y)) + - geom_hex() + - coord_fixed() - -ggplot(df, aes(x, y)) + - geom_hex() + - viridis::scale_fill_viridis() + - coord_fixed() -``` - -Note that all colour scales come in two variety: `scale_colour_x()` and `scale_fill_x()` for the `colour` and `fill` aesthetics respectively (the colour scales are available in both UK and US spellings). - -### Exercises - -1. Why doesn't the following code override the default scale? - - ```{r fig.show = "hide"} - ggplot(df, aes(x, y)) + - geom_hex() + - scale_colour_gradient(low = "white", high = "red") + - coord_fixed() - ``` - -1. What is the first argument to every scale? How does it compare to `labs()`? - -1. Change the display of the presidential terms by: - - 1. Combining the two variants shown above. - 1. Improving the display of the y axis. - 1. Labelling each term with the name of the president. - 1. Adding informative plot labels. - 1. Placing breaks every 4 years (this is trickier than it seems!). - -1. Use `override.aes` to make the legend on the following plot easier to see. - - ```{r, dev = "png", out.width = "50%"} - ggplot(diamonds, aes(carat, price)) + - geom_point(aes(colour = cut), alpha = 1/20) - ``` - -## Zooming - -There are three ways to control the plot limits: - -1. Adjusting what data are plotted -1. Setting the limits in each scale -1. Setting `xlim` and `ylim` in `coord_cartesian()` - -To zoom in on a region of the plot, it's generally best to use `coord_cartesian()`. Compare the following two plots: - -```{r out.width = "50%", fig.align = "default", message = FALSE} -ggplot(mpg, mapping = aes(displ, hwy)) + - geom_point(aes(color = class)) + - geom_smooth() + - coord_cartesian(xlim = c(5, 7), ylim = c(10, 30)) - -mpg %>% - filter(displ >= 5, displ <= 7, hwy >= 10, hwy <= 30) %>% - ggplot(aes(displ, hwy)) + - geom_point(aes(color = class)) + - geom_smooth() -``` - -You can also set the `limits` on individual scales. Reducing the limits is basically equivalent to subsetting the data. It is generally more useful if you want _expand_ the limits, for example, to match scales across different plots. For example, if we extract two classes of cars and plot them separately, it's difficult to compare the plots because all three scales (the x-axis, the y-axis, and the colour aesthetic) have different ranges. - -```{r out.width = "50%", fig.align = "default", fig.width = 4} -suv <- mpg %>% filter(class == "suv") -compact <- mpg %>% filter(class == "compact") - -ggplot(suv, aes(displ, hwy, colour = drv)) + - geom_point() - -ggplot(compact, aes(displ, hwy, colour = drv)) + - geom_point() -``` - -One way to overcome this problem is to share scales across multiple plots, training the scales with the `limits` of the full data. - -```{r out.width = "50%", fig.align = "default", fig.width = 4} -x_scale <- scale_x_continuous(limits = range(mpg$displ)) -y_scale <- scale_y_continuous(limits = range(mpg$hwy)) -col_scale <- scale_colour_discrete(limits = unique(mpg$drv)) - -ggplot(suv, aes(displ, hwy, colour = drv)) + - geom_point() + - x_scale + - y_scale + - col_scale - -ggplot(compact, aes(displ, hwy, colour = drv)) + - geom_point() + - x_scale + - y_scale + - col_scale -``` - -In this particular case, you could have simply used faceting, but this technique is useful more generally, if for instance, you want spread plots over multiple pages of a report. 
- -## Themes - -Finally, you can customise the non-data elements of your plot with a theme: - -```{r, message = FALSE} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = class)) + - geom_smooth(se = FALSE) + - theme_bw() -``` - -ggplot2 includes eight themes by default, as shown in Figure \@ref(fig:themes). Many more are included in add-on packages like __ggthemes__ (), by Jeffrey Arnold. - -```{r themes, echo = FALSE, fig.cap = "The eight themes built-in to ggplot2."} -knitr::include_graphics("images/visualization-themes.png") -``` - -Many people wonder why the default theme has a grey background. This was a deliberate choice because it puts the data forward while still making the grid lines visible. The white grid lines are visible (which is important because they significantly aid position judgements), but they have little visual impact and we can easily tune them out. The grey background gives the plot a similar typographic colour to the text, ensuring that the graphics fit in with the flow of a document without jumping out with a bright white background. Finally, the grey background creates a continuous field of colour which ensures that the plot is perceived as a single visual entity. - -It's also possible to control individual components of each theme, like the size and colour of the font used for the y axis. Unfortunately, this level of detail is outside the scope of this book, so you'll need to read the [ggplot2 book](https://amzn.com/331924275X) for the full details. You can also create your own themes, if you are trying to match a particular corporate or journal style. - -## Saving your plots - -There are two main ways to get your plots out of R and into your final write-up: `ggsave()` and knitr. `ggsave()` will save the most recent plot to disk: - -```{r, fig.show = "none"} -ggplot(mpg, aes(displ, hwy)) + geom_point() -ggsave("my-plot.pdf") -``` -```{r, include = FALSE} -file.remove("my-plot.pdf") -``` - -If you don't specify the `width` and `height` they will be taken from the dimensions of the current plotting device. For reproducible code, you'll want to specify them. - -Generally, however, I think you should be assembling your final reports using R Markdown, so I want to focus on the important code chunk options that you should know about for graphics. You can learn more about `ggsave()` in the documentation. - -### Figure sizing - -The biggest challenge of graphics in R Markdown is getting your figures the right size and shape. There are five main options that control figure sizing: `fig.width`, `fig.height`, `fig.asp`, `out.width` and `out.height`. Image sizing is challenging because there are two sizes (the size of the figure created by R and the size at which it is inserted in the output document), and multiple ways of specifying the size (i.e., height, width, and aspect ratio: pick two of three). - -I only ever use three of the five options: - -* I find it most aesthetically pleasing for plots to have a consistent - width. To enforce this, I set `fig.width = 6` (6") and `fig.asp = 0.618` - (the golden ratio) in the defaults. Then in individual chunks, I only - adjust `fig.asp`. - -* I control the output size with `out.width` and set it to a percentage - of the line width). I default to `out.width = "70%"` - and `fig.align = "center"`. That give plots room to breathe, without taking - up too much space. 
- -* To put multiple plots in a single row I set the `out.width` to - `50%` for two plots, `33%` for 3 plots, or `25%` to 4 plots, and set - `fig.align = "default"`. Depending on what I'm trying to illustrate (e.g. - show data or show plot variations), I'll also tweak `fig.width`, as - discussed below. - -If you find that you're having to squint to read the text in your plot, you need to tweak `fig.width`. If `fig.width` is larger than the size the figure is rendered in the final doc, the text will be too small; if `fig.width` is smaller, the text will be too big. You'll often need to do a little experimentation to figure out the right ratio between the `fig.width` and the eventual width in your document. To illustrate the principle, the following three plots have `fig.width` of 4, 6, and 8 respectively: - -```{r, include = FALSE} -plot <- ggplot(mpg, aes(displ, hwy)) + geom_point() -``` -```{r, fig.width = 4, echo = FALSE} -plot -``` -```{r, fig.width = 6, echo = FALSE} -plot -``` -```{r, fig.width = 8, echo = FALSE} -plot -``` - -If you want to make sure the font size is consistent across all your figures, whenever you set `out.width`, you'll also need to adjust `fig.width` to maintain the same ratio with your default `out.width`. For example, if your default `fig.width` is 6 and `out.width` is 0.7, when you set `out.width = "50%"` you'll need to set `fig.width` to 4.3 (6 * 0.5 / 0.7). - -### Other important options - -When mingling code and text, like I do in this book, I recommend setting `fig.show = "hold"` so that plots are shown after the code. This has the pleasant side effect of forcing you to break up large blocks of code with their explanations. - -To add a caption to the plot, use `fig.cap`. In R Markdown this will change the figure from inline to "floating". - -If you're producing PDF output, the default graphics type is PDF. This is a good default because PDFs are high quality vector graphics. However, they can produce very large and slow plots if you are displaying thousands of points. In that case, set `dev = "png"` to force the use of PNGs. They are slightly lower quality, but will be much more compact. - -It's a good idea to name code chunks that produce figures, even if you don't routinely label other chunks. The chunk label is used to generate the file name of the graphic on disk, so naming your chunks makes it much easier to pick out plots and reuse in other circumstances (i.e. if you want to quickly drop a single plot into an email or a tweet). - -## Learning more - -The absolute best place to learn more is the ggplot2 book: [_ggplot2: Elegant graphics for data analysis_](https://amzn.com/331924275X). It goes into much more depth about the underlying theory, and has many more examples of how to combine the individual pieces to solve practical problems. Unfortunately, the book is not available online for free, although you can find the source code at . - -Another great resource is the ggplot2 extensions guide . This site lists many of the packages that extend ggplot2 with new geoms and scales. It's a great place to start if you're trying to do something that seems hard with ggplot2. 
diff --git a/communicate.Rmd b/communicate.Rmd deleted file mode 100644 index c4d9cccc5..000000000 --- a/communicate.Rmd +++ /dev/null @@ -1,32 +0,0 @@ -# (PART) Communicate {-}
-
-# Introduction {#communicate-intro}
-
-So far, you've learned the tools to get your data into R, tidy it into a form convenient for analysis, and then understand your data through transformation, visualisation and modelling. However, it doesn't matter how great your analysis is unless you can explain it to others: you need to __communicate__ your results.
-
-```{r echo = FALSE, out.width = "75%"}
-knitr::include_graphics("diagrams/data-science-communicate.png")
-```
-
-Communication is the theme of the following four chapters:
-
-* In [R Markdown], you will learn about R Markdown, a tool for integrating
-  prose, code, and results. You can use R Markdown in notebook mode for
-  analyst-to-analyst communication, and in report mode for
-  analyst-to-decision-maker communication. Thanks to the power of R Markdown
-  formats, you can even use the same document for both purposes.
-
-* In [Graphics for communication], you will learn how to take your exploratory
-  graphics and turn them into expository graphics, graphics that help the
-  newcomer to your analysis understand what's going on as quickly and
-  easily as possible.
-
-* In [R Markdown formats], you'll learn a little about the many other varieties
-  of outputs you can produce using R Markdown, including dashboards, websites,
-  and books.
-
-* We'll finish up with [R Markdown workflow], where you'll learn about the
-  "analysis notebook" and how to systematically record your successes and
-  failures so that you can learn from them.
-
-Unfortunately, these chapters focus mostly on the technical mechanics of communication, not the really hard problems of communicating your thoughts to other humans. However, there are lot of other great books about communication, which we'll point you to at the end of each chapter. diff --git a/communicate.qmd b/communicate.qmd new file mode 100644 index 000000000..073efc6da --- /dev/null +++ b/communicate.qmd @@ -0,0 +1,36 @@ +# Communicate {#sec-communicate-intro .unnumbered}

```{r}
#| echo: false

source("_common.R")
```

So far, you've learned the tools to get your data into R, tidy it into a form convenient for analysis, and then understand your data through transformation and visualization.
However, it doesn't matter how great your analysis is unless you can explain it to others: you need to **communicate** your results.

```{r}
#| label: fig-ds-communicate
#| echo: false
#| fig-cap: |
#|   Communication is the final part of the data science process; if you
#|   can't communicate your results to other humans, it doesn't matter how
#|   great your analysis is.
#| fig-alt: |
#|   A diagram displaying the data science cycle with
#|   communicate highlighted in blue.
#| out.width: NULL

knitr::include_graphics("diagrams/data-science/communicate.png", dpi = 270)
```

Communication is the theme of the following two chapters:

- In @sec-quarto, you will learn about Quarto, a tool for integrating prose, code, and results.
    You can use Quarto for analyst-to-analyst communication as well as analyst-to-decision-maker communication.
    Thanks to the power of Quarto formats, you can even use the same document for both purposes.

- In @sec-quarto-formats, you'll learn a little about the many other varieties of outputs you can produce using Quarto, including dashboards, websites, and books.
+

These chapters focus mostly on the technical mechanics of communication, not the really hard problems of communicating your thoughts to other humans.
However, there are a lot of other great books about communication, which we'll point you to at the end of each chapter. diff --git a/communication.qmd b/communication.qmd new file mode 100644 index 000000000..af2ad1a1e --- /dev/null +++ b/communication.qmd @@ -0,0 +1,1090 @@ +# Communication {#sec-communication}

```{r}
#| echo: false

source("_common.R")
```

## Introduction

In @sec-exploratory-data-analysis, you learned how to use plots as tools for *exploration*.
When you make exploratory plots, you know---even before looking---which variables the plot will display.
You made each plot for a purpose, could quickly look at it, and then move on to the next plot.
In the course of most analyses, you'll produce tens or hundreds of plots, most of which are immediately thrown away.

Now that you understand your data, you need to *communicate* your understanding to others.
Your audience will likely not share your background knowledge and will not be deeply invested in the data.
To help others quickly build up a good mental model of the data, you will need to invest considerable effort in making your plots as self-explanatory as possible.
In this chapter, you'll learn some of the tools that ggplot2 provides to do so.

This chapter focuses on the tools you need to create good graphics.
We assume that you know what you want, and just need to know how to do it.
For that reason, we highly recommend pairing this chapter with a good general visualization book.
We particularly like [The Truthful Art](https://www.amazon.com/gp/product/0321934075/), by Albert Cairo.
It doesn't teach the mechanics of creating visualizations, but instead focuses on what you need to think about in order to create effective graphics.

### Prerequisites

In this chapter, we'll focus once again on ggplot2.
We'll also use a little dplyr for data manipulation, **scales** to override the default breaks, labels, transformations, and palettes, and a few ggplot2 extension packages, including **ggrepel** ([https://ggrepel.slowkow.com](https://ggrepel.slowkow.com/)) by Kamil Slowikowski and **patchwork** ([https://patchwork.data-imaginist.com](https://patchwork.data-imaginist.com/)) by Thomas Lin Pedersen.
Don't forget that you'll need to install those packages with `install.packages()` if you don't already have them.

```{r}
#| label: setup
#| message: false

library(tidyverse)
library(scales)
library(ggrepel)
library(patchwork)
```

## Labels

The easiest place to start when turning an exploratory graphic into an expository graphic is with good labels.
You add labels with the `labs()` function.

```{r}
#| message: false
#| fig-alt: |
#|   Scatterplot of highway fuel efficiency versus engine size of cars, where
#|   points are colored according to the car class. A smooth curve following
#|   the relationship between highway fuel efficiency and engine size of cars
#|   is overlaid. The x-axis is labelled "Engine displacement (L)" and the
#|   y-axis is labelled "Highway fuel economy (mpg)". The legend is labelled
#|   "Car type". The plot is titled "Fuel efficiency generally decreases with
#|   engine size". The subtitle is "Two seaters (sports cars) are an exception
#|   because of their light weight" and the caption is "Data from
#|   fueleconomy.gov".
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class)) +
+  geom_smooth(se = FALSE) +
+  labs(
+    x = "Engine displacement (L)",
+    y = "Highway fuel economy (mpg)",
+    color = "Car type",
+    title = "Fuel efficiency generally decreases with engine size",
+    subtitle = "Two seaters (sports cars) are an exception because of their light weight",
+    caption = "Data from fueleconomy.gov"
+  )
+```
+
+The purpose of a plot title is to summarize the main finding.
+Avoid titles that just describe what the plot is, e.g., "A scatterplot of engine displacement vs. fuel economy".
+
+If you need to add more text, there are two other useful labels: `subtitle` adds additional detail in a smaller font beneath the title and `caption` adds text at the bottom right of the plot, often used to describe the source of the data.
+You can also use `labs()` to replace the axis and legend titles.
+It's usually a good idea to replace short variable names with more detailed descriptions, and to include the units.
+
+It's possible to use mathematical equations instead of text strings.
+Just switch `""` out for `quote()` and read about the available options in `?plotmath`:
+
+```{r}
+#| fig-asp: 1
+#| out-width: "50%"
+#| fig-width: 3
+#| fig-alt: |
+#|   Scatterplot with math text on the x and y axis labels. X-axis label
+#|   says x_i, y-axis label says sum of x_i squared, for i from 1 to n.
+
+df <- tibble(
+  x = 1:10,
+  y = cumsum(x^2)
+)
+
+ggplot(df, aes(x, y)) +
+  geom_point() +
+  labs(
+    x = quote(x[i]),
+    y = quote(sum(x[i] ^ 2, i == 1, n))
+  )
+```
+
+### Exercises
+
+1. Create one plot on the fuel economy data with customized `title`, `subtitle`, `caption`, `x`, `y`, and `color` labels.
+
+2. Recreate the following plot using the fuel economy data.
+   Note that both the colors and shapes of points vary by type of drive train.
+
+    ```{r}
+    #| echo: false
+    #| fig-alt: |
+    #|   Scatterplot of highway versus city fuel efficiency. Shapes and
+    #|   colors of points are determined by type of drive train.
+
+    ggplot(mpg, aes(x = cty, y = hwy, color = drv, shape = drv)) +
+      geom_point() +
+      labs(
+        x = "City MPG",
+        y = "Highway MPG",
+        shape = "Type of\ndrive train",
+        color = "Type of\ndrive train"
+      )
+    ```
+
+3. Take an exploratory graphic that you've created in the last month, and add informative titles to make it easier for others to understand.
+
+## Annotations
+
+In addition to labelling major components of your plot, it's often useful to label individual observations or groups of observations.
+The first tool you have at your disposal is `geom_text()`.
+`geom_text()` is similar to `geom_point()`, but it has an additional aesthetic: `label`.
+This makes it possible to add textual labels to your plots.
+
+There are two possible sources of labels.
+First, you might have a tibble that provides labels.
+In the following plot we pull out the cars with the highest engine size in each drive type and save their information as a new data frame called `label_info`.
+
+```{r}
+label_info <- mpg |>
+  group_by(drv) |>
+  arrange(desc(displ)) |>
+  slice_head(n = 1) |>
+  mutate(
+    drive_type = case_when(
+      drv == "f" ~ "front-wheel drive",
+      drv == "r" ~ "rear-wheel drive",
+      drv == "4" ~ "4-wheel drive"
+    )
+  ) |>
+  select(displ, hwy, drv, drive_type)
+
+label_info
+```
+
+Then, we use this new data frame to label the three groups, replacing the legend with labels placed directly on the plot.
+Using the `fontface` and `size` arguments we can customize the look of the text labels.
+They're larger than the rest of the text on the plot and bolded.
+(`theme(legend.position = "none")` turns all the legends off --- we'll talk about it more shortly.)
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway mileage versus engine size where points are colored
+#|   by drive type. Smooth curves for each drive type are overlaid.
+#|   Text labels identify the curves as front-wheel, rear-wheel, and 4-wheel.
+
+ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
+  geom_point(alpha = 0.3) +
+  geom_smooth(se = FALSE) +
+  geom_text(
+    data = label_info,
+    aes(x = displ, y = hwy, label = drive_type),
+    fontface = "bold", size = 5, hjust = "right", vjust = "bottom"
+  ) +
+  theme(legend.position = "none")
+```
+
+Note the use of `hjust` (horizontal justification) and `vjust` (vertical justification) to control the alignment of the label.
+
+However, the annotated plot we made above is hard to read because the labels overlap with each other, and with the points.
+We can use the `geom_label_repel()` function from the ggrepel package to address both of these issues.
+This useful package will automatically adjust labels so that they don't overlap:
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars, where
+#|   points are colored according to the drive type. The drive types are
+#|   labelled directly on the plot. The labels are boxed, with a white,
+#|   transparent background, and positioned so they don't overlap.
+
+ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
+  geom_point(alpha = 0.3) +
+  geom_smooth(se = FALSE) +
+  geom_label_repel(
+    data = label_info,
+    aes(x = displ, y = hwy, label = drive_type),
+    fontface = "bold", size = 5, nudge_y = 2
+  ) +
+  theme(legend.position = "none")
+```
+
+You can also use the same idea to highlight certain points on a plot with `geom_text_repel()` from the ggrepel package.
+Note another handy technique used here: we added a second layer of large, hollow points to further highlight the labelled points.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars. Points
+#|   where highway mileage is above 40 as well as above 20 with engine size
+#|   above 5 are red, with a hollow red circle, and labelled with model name
+#|   of the car.
+
+potential_outliers <- mpg |>
+  filter(hwy > 40 | (hwy > 20 & displ > 5))
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point() +
+  geom_text_repel(data = potential_outliers, aes(label = model)) +
+  geom_point(data = potential_outliers, color = "red") +
+  geom_point(
+    data = potential_outliers,
+    color = "red", size = 3, shape = "circle open"
+  )
+```
+
+Remember, in addition to `geom_text()` and `geom_label()`, you have many other geoms in ggplot2 available to help annotate your plot.
+A couple of ideas:
+
+- Use `geom_hline()` and `geom_vline()` to add reference lines.
+  We often make them thick (`linewidth = 2`) and white (`color = "white"`), and draw them underneath the primary data layer.
+  That makes them easy to see, without drawing attention away from the data.
+
+- Use `geom_rect()` to draw a rectangle around points of interest.
+  The boundaries of the rectangle are defined by aesthetics `xmin`, `xmax`, `ymin`, `ymax`.
+  Alternatively, look into the [ggforce package](https://ggforce.data-imaginist.com/index.html), specifically [`geom_mark_hull()`](https://ggforce.data-imaginist.com/reference/geom_mark_hull.html), which allows you to annotate subsets of points with hulls.
+
+- Use `geom_segment()` with the `arrow` argument to draw attention to a point with an arrow.
+  Use aesthetics `x` and `y` to define the starting location, and `xend` and `yend` to define the end location.
+
+Another handy function for adding annotations to plots is `annotate()`.
+As a rule of thumb, geoms are generally useful for highlighting a subset of the data while `annotate()` is useful for adding one or a few annotation elements to a plot.
+
+To demonstrate using `annotate()`, let's create some text to add to our plot.
+The text is a bit long, so we'll use `stringr::str_wrap()` to automatically add line breaks to it given the number of characters you want per line:
+
+```{r}
+trend_text <- "Larger engine sizes tend to have lower fuel economy." |>
+  str_wrap(width = 30)
+trend_text
+```
+
+Then, we add two layers of annotation: one with a label geom and the other with a segment geom.
+The `x` and `y` aesthetics in both define where the annotation should start, and the `xend` and `yend` aesthetics in the segment annotation define the end location of the segment.
+Note also that the segment is styled as an arrow.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars. A red
+#|   arrow pointing down follows the trend of the points and the annotation
+#|   placed next to the arrow reads "Larger engine sizes tend to have lower
+#|   fuel economy". The arrow and the annotation text is red.
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point() +
+  annotate(
+    geom = "label", x = 3.5, y = 38,
+    label = trend_text,
+    hjust = "left", color = "red"
+  ) +
+  annotate(
+    geom = "segment",
+    x = 3, y = 35, xend = 5, yend = 25, color = "red",
+    arrow = arrow(type = "closed")
+  )
+```
+
+Annotation is a powerful tool for communicating main takeaways and interesting features of your visualizations.
+The only limit is your imagination (and your patience with positioning annotations to be aesthetically pleasing)!
+
+### Exercises
+
+1. Use `geom_text()` with infinite positions to place text at the four corners of the plot.
+
+2. Use `annotate()` to add a point geom in the middle of your last plot without having to create a tibble.
+   Customize the shape, size, or color of the point.
+
+3. How do labels with `geom_text()` interact with faceting?
+   How can you add a label to a single facet?
+   How can you put a different label in each facet?
+   (Hint: Think about the dataset that is being passed to `geom_text()`.)
+
+4. What arguments to `geom_label()` control the appearance of the background box?
+
+5. What are the four arguments to `arrow()`?
+   How do they work?
+   Create a series of plots that demonstrate the most important options.
+
+## Scales
+
+The third way you can make your plot better for communication is to adjust the scales.
+Scales control how the aesthetic mappings manifest visually.
+
+### Default scales
+
+Normally, ggplot2 automatically adds scales for you.
+For example, when you type:
+
+```{r}
+#| label: default-scales
+#| fig-show: "hide"
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class))
+```
+
+ggplot2 automatically adds default scales behind the scenes:
+
+```{r}
+#| fig-show: "hide"
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class)) +
+  scale_x_continuous() +
+  scale_y_continuous() +
+  scale_color_discrete()
+```
+
+Note the naming scheme for scales: `scale_` followed by the name of the aesthetic, then `_`, then the name of the scale.
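+
+For example, all of the following are existing scale functions whose names follow that pattern (shown unevaluated, purely to illustrate the scheme):
+
+```{r}
+#| eval: false
+
+scale_x_continuous()    # "x" aesthetic, "continuous" scale
+scale_color_discrete()  # "color" aesthetic, "discrete" scale
+scale_x_log10()         # "x" aesthetic, "log10" transformation scale
+scale_fill_viridis_c()  # "fill" aesthetic, continuous viridis scale
+```
+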
+The default scales are named according to the type of variable they align with: continuous, discrete, datetime, or date.
+`scale_x_continuous()` puts the numeric values from `displ` on a continuous number line on the x-axis, `scale_color_discrete()` chooses colors for each `class` of car, etc.
+There are lots of non-default scales which you'll learn about below.
+
+The default scales have been carefully chosen to do a good job for a wide range of inputs.
+Nevertheless, you might want to override the defaults for two reasons:
+
+- You might want to tweak some of the parameters of the default scale.
+  This allows you to do things like change the breaks on the axes, or the key labels on the legend.
+
+- You might want to replace the scale altogether, and use a completely different algorithm.
+  Often you can do better than the default because you know more about the data.
+
+### Axis ticks and legend keys
+
+Collectively, axes and legends are called **guides**.
+Axes are used for x and y aesthetics; legends are used for everything else.
+
+There are two primary arguments that affect the appearance of the ticks on the axes and the keys on the legend: `breaks` and `labels`.
+`breaks` controls the position of the ticks, or the values associated with the keys.
+`labels` controls the text label associated with each tick/key.
+The most common use of `breaks` is to override the default choice:
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars,
+#|   colored by drive. The y-axis has breaks starting at 15 and ending at 40,
+#|   increasing by 5.
+
+ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
+  geom_point() +
+  scale_y_continuous(breaks = seq(15, 40, by = 5))
+```
+
+You can use `labels` in the same way (a character vector the same length as `breaks`), but you can also set it to `NULL` to suppress the labels altogether.
+This can be useful for maps, or for publishing plots where you can't share the absolute numbers.
+You can also use `breaks` and `labels` to control the appearance of legends.
+For discrete scales for categorical variables, `labels` can be a named list of the existing level names and the desired labels for them.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars, colored
+#|   by drive. The x and y-axes do not have any labels at the axis ticks.
+#|   The legend has custom labels: 4-wheel, front, rear.
+
+ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
+  geom_point() +
+  scale_x_continuous(labels = NULL) +
+  scale_y_continuous(labels = NULL) +
+  scale_color_discrete(labels = c("4" = "4-wheel", "f" = "front", "r" = "rear"))
+```
+
+The `labels` argument coupled with labelling functions from the scales package is also useful for formatting numbers as currency, percent, etc.
+The plot on the left shows default labelling with `label_dollar()`, which adds a dollar sign as well as a thousand separator comma.
+The plot on the right adds further customization by dividing dollar values by 1,000 and adding a suffix "K" (for "thousands") as well as adding custom breaks.
+Note that `breaks` is in the original scale of the data.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| fig-alt: |
+#|   Two side-by-side box plots of price versus cut of diamonds. The outliers
+#|   are transparent. On both plots the x-axis labels are formatted as dollars.
+#|   The x-axis labels on the left plot start at $0 and go to $15,000,
+#|   increasing by $5,000.
+#|   The x-axis labels on the right plot start at $1K and go to $19K,
+#|   increasing by $6K.
+
+# Left
+ggplot(diamonds, aes(x = price, y = cut)) +
+  geom_boxplot(alpha = 0.05) +
+  scale_x_continuous(labels = label_dollar())
+
+# Right
+ggplot(diamonds, aes(x = price, y = cut)) +
+  geom_boxplot(alpha = 0.05) +
+  scale_x_continuous(
+    labels = label_dollar(scale = 1/1000, suffix = "K"),
+    breaks = seq(1000, 19000, by = 6000)
+  )
+```
+
+Another handy label function is `label_percent()`:
+
+```{r}
+#| fig-alt: |
+#|   Segmented bar plots of cut, filled with levels of clarity. The y-axis
+#|   labels start at 0% and go to 100%, increasing by 25%. The y-axis label
+#|   name is "Percentage".
+
+ggplot(diamonds, aes(x = cut, fill = clarity)) +
+  geom_bar(position = "fill") +
+  scale_y_continuous(name = "Percentage", labels = label_percent())
+```
+
+Another use of `breaks` is when you have relatively few data points and want to highlight exactly where the observations occur.
+For example, take this plot that shows when each US president started and ended their term.
+
+```{r}
+#| fig-alt: |
+#|   Line plot of id number of presidents versus the year they started their
+#|   presidency. Start year is marked with a point and a segment that starts
+#|   there and ends at the end of the presidency. The x-axis labels are
+#|   formatted as two digit years starting with an apostrophe, e.g., '53.
+
+presidential |>
+  mutate(id = 33 + row_number()) |>
+  ggplot(aes(x = start, y = id)) +
+  geom_point() +
+  geom_segment(aes(xend = end, yend = id)) +
+  scale_x_date(name = NULL, breaks = presidential$start, date_labels = "'%y")
+```
+
+Note that for the `breaks` argument we pulled out the `start` variable as a vector with `presidential$start` because we can't do an aesthetic mapping for this argument.
+Also note that the specification of breaks and labels for date and datetime scales is a little different:
+
+- `date_labels` takes a format specification, in the same form as `parse_datetime()`.
+
+- `date_breaks` (not shown here) takes a string like "2 days" or "1 month".
+
+### Legend layout
+
+You will most often use `breaks` and `labels` to tweak the axes.
+While they both also work for legends, there are a few other techniques you are more likely to use.
+
+To control the overall position of the legend, you need to use a `theme()` setting.
+We'll come back to themes at the end of the chapter, but in brief, they control the non-data parts of the plot.
+The theme setting `legend.position` controls where the legend is drawn:
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| fig-alt: |
+#|   Four scatterplots of highway fuel efficiency versus engine size of cars
+#|   where points are colored based on class of car. Clockwise, the legend
+#|   is placed on the right, left, top, and bottom of the plot.
+
+base <- ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class))
+
+base + theme(legend.position = "right") # the default
+base + theme(legend.position = "left")
+base +
+  theme(legend.position = "top") +
+  guides(color = guide_legend(nrow = 3))
+base +
+  theme(legend.position = "bottom") +
+  guides(color = guide_legend(nrow = 3))
+```
+
+If your plot is short and wide, place the legend at the top or bottom, and if it's tall and narrow, place the legend at the left or right.
+You can also use `legend.position = "none"` to suppress the display of the legend altogether.
+
+To control the display of individual legends, use `guides()` along with `guide_legend()` or `guide_colorbar()`.
+The following example shows two important settings: controlling the number of rows the legend uses with `nrow`, and overriding one of the aesthetics to make the points bigger.
+This is particularly useful if you have used a low `alpha` to display many points on a plot.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars
+#|   where points are colored based on class of car. Overlaid on the plot is a
+#|   smooth curve. The legend is at the bottom and classes are listed
+#|   horizontally in two rows. The points in the legend are larger than the
+#|   points in the plot.
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class)) +
+  geom_smooth(se = FALSE) +
+  theme(legend.position = "bottom") +
+  guides(color = guide_legend(nrow = 2, override.aes = list(size = 4)))
+```
+
+Note that the name of the argument in `guides()` matches the name of the aesthetic, just like in `labs()`.
+
+### Replacing a scale
+
+Instead of just tweaking the details a little, you can instead replace the scale altogether.
+There are two types of scales you're most likely to want to switch out: continuous position scales and color scales.
+Fortunately, the same principles apply to all the other aesthetics, so once you've mastered position and color, you'll be able to quickly pick up other scale replacements.
+
+It's very useful to plot transformations of your variable.
+For example, it's easier to see the precise relationship between `carat` and `price` if we log transform them:
+
+```{r}
+#| fig-align: default
+#| layout-ncol: 2
+#| fig-width: 3
+#| fig-alt: |
+#|   Two plots of price versus carat of diamonds. Data are binned and the
+#|   rectangles representing each bin are colored based on the number of points
+#|   that fall into that bin. In the plot on the right, price and carat values
+#|   are logged and the axis labels show the logged values.
+
+# Left
+ggplot(diamonds, aes(x = carat, y = price)) +
+  geom_bin2d()
+
+# Right
+ggplot(diamonds, aes(x = log10(carat), y = log10(price))) +
+  geom_bin2d()
+```
+
+However, the disadvantage of this transformation is that the axes are now labelled with the transformed values, making it hard to interpret the plot.
+Instead of doing the transformation in the aesthetic mapping, we can instead do it with the scale.
+This is visually identical, except the axes are labelled on the original data scale.
+
+```{r}
+#| fig-alt: |
+#|   Plot of price versus carat of diamonds. Data are binned and the
+#|   rectangles representing each bin are colored based on the number of points
+#|   that fall into that bin. The axis labels are on the original data scale.
+
+ggplot(diamonds, aes(x = carat, y = price)) +
+  geom_bin2d() +
+  scale_x_log10() +
+  scale_y_log10()
+```
+
+Another scale that is frequently customized is color.
+The default categorical scale picks colors that are evenly spaced around the color wheel.
+Useful alternatives are the ColorBrewer scales which have been hand tuned to work better for people with common types of color blindness.
+The two plots below look similar, but there is enough difference in the shades of red and green that the dots on the right can be distinguished even by people with red-green color blindness.[^communication-1]
+
+[^communication-1]: You can use a tool like [Sim Daltonism](https://michelf.ca/projects/sim-daltonism/) to simulate color blindness to test these images.
+
+```{r}
+#| fig-align: default
+#| layout-ncol: 2
+#| fig-width: 3
+#| fig-alt: |
+#|   Two scatterplots of highway mileage versus engine size where points are
+#|   colored by drive type. The plot on the left uses the default
+#|   ggplot2 color palette and the plot on the right uses a different color
+#|   palette.
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv))
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv)) +
+  scale_color_brewer(palette = "Set1")
+```
+
+Don't forget simpler techniques for improving accessibility.
+If there are just a few colors, you can add a redundant shape mapping.
+This will also help ensure your plot is interpretable in black and white.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway mileage versus engine size where both the color
+#|   and shape of points are based on drive type. The color palette is not
+#|   the default ggplot2 palette.
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv, shape = drv)) +
+  scale_color_brewer(palette = "Set1")
+```
+
+The ColorBrewer scales are documented online and made available in R via the **RColorBrewer** package, by Erich Neuwirth.
+@fig-brewer shows the complete list of all palettes.
+The sequential (top) and diverging (bottom) palettes are particularly useful if your categorical values are ordered, or have a "middle".
+This often arises if you've used `cut()` to make a continuous variable into a categorical variable.
+
+```{r}
+#| label: fig-brewer
+#| echo: false
+#| fig-cap: All ColorBrewer scales.
+#| fig-asp: 2.5
+#| fig-alt: |
+#|   All ColorBrewer scales. One group goes from light to dark colors.
+#|   Another group is a set of non-ordinal colors. And the last group has
+#|   diverging scales (from dark to light to dark again). Within each set
+#|   there are a number of palettes.
+
+par(mar = c(0, 3, 0, 0))
+RColorBrewer::display.brewer.all()
+```
+
+When you have a predefined mapping between values and colors, use `scale_color_manual()`.
+For example, if we map presidential party to color, we want to use the standard mapping of red for Republicans and blue for Democrats.
+One approach for assigning these colors is using hex color codes:
+
+```{r}
+#| fig-alt: |
+#|   Line plot of id number of presidents versus the year they started their
+#|   presidency. Start year is marked with a point and a segment that starts
+#|   there and ends at the end of the presidency. Democratic presidents are
+#|   represented in blue and Republicans in red.
+
+presidential |>
+  mutate(id = 33 + row_number()) |>
+  ggplot(aes(x = start, y = id, color = party)) +
+  geom_point() +
+  geom_segment(aes(xend = end, yend = id)) +
+  scale_color_manual(values = c(Republican = "#E81B23", Democratic = "#00AEF3"))
+```
+
+For continuous color, you can use the built-in `scale_color_gradient()` or `scale_fill_gradient()`.
+If you have a diverging scale, you can use `scale_color_gradient2()`.
+That allows you to give, for example, positive and negative values different colors.
+That's sometimes also useful if you want to distinguish points above or below the mean.
+
+Another option is to use the viridis color scales.
+The designers, Nathaniel Smith and Stéfan van der Walt, carefully tailored continuous color schemes that are perceptible to people with various forms of color blindness as well as perceptually uniform in both color and black and white.
+These scales are available as continuous (`c`), discrete (`d`), and binned (`b`) palettes in ggplot2.
+
+```{r}
+#| fig-align: default
+#| layout-ncol: 2
+#| fig-width: 3
+#| fig-asp: 0.75
+#| fig-alt: |
+#|   Three hex plots where the color of the hexes show the number of observations
+#|   that fall into that hex bin. The first plot uses the default, continuous
+#|   ggplot2 scale. The second plot uses the viridis, continuous scale, and the
+#|   third plot uses the viridis, binned scale.
+
+df <- tibble(
+  x = rnorm(10000),
+  y = rnorm(10000)
+)
+
+ggplot(df, aes(x, y)) +
+  geom_hex() +
+  coord_fixed() +
+  labs(title = "Default, continuous", x = NULL, y = NULL)
+
+ggplot(df, aes(x, y)) +
+  geom_hex() +
+  coord_fixed() +
+  scale_fill_viridis_c() +
+  labs(title = "Viridis, continuous", x = NULL, y = NULL)
+
+ggplot(df, aes(x, y)) +
+  geom_hex() +
+  coord_fixed() +
+  scale_fill_viridis_b() +
+  labs(title = "Viridis, binned", x = NULL, y = NULL)
+```
+
+Note that all color scales come in two varieties: `scale_color_*()` and `scale_fill_*()` for the `color` and `fill` aesthetics respectively (the color scales are available in both UK and US spellings).
+
+### Zooming
+
+There are three ways to control the plot limits:
+
+1. Adjusting what data are plotted.
+2. Setting the limits in each scale.
+3. Setting `xlim` and `ylim` in `coord_cartesian()`.
+
+We'll demonstrate these options in a series of plots.
+The plot on the left shows the relationship between engine size and fuel efficiency, colored by type of drive train.
+The plot on the right shows the same variables, but subsets the data that are plotted.
+Subsetting the data has affected the x and y scales as well as the smooth curve.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| message: false
+#| fig-alt: |
+#|   On the left, a scatterplot of highway mileage vs. displacement. The
+#|   smooth curve overlaid shows a decreasing, and then increasing trend,
+#|   like a hockey stick. On the right, same variables
+#|   are plotted with displacement ranging only from 5 to 6 and highway
+#|   mileage ranging only from 10 to 25. The smooth curve overlaid shows a
+#|   trend that's slightly increasing first and then decreasing.
+
+# Left
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv)) +
+  geom_smooth()
+
+# Right
+mpg |>
+  filter(displ >= 5 & displ <= 6 & hwy >= 10 & hwy <= 25) |>
+  ggplot(aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv)) +
+  geom_smooth()
+```
+
+Let's compare these to the two plots below where the plot on the left sets the `limits` on individual scales and the plot on the right sets them in `coord_cartesian()`.
+We can see that reducing the limits is equivalent to subsetting the data.
+Therefore, to zoom in on a region of the plot, it's generally best to use `coord_cartesian()`.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| message: false
+#| warning: false
+#| fig-alt: |
+#|   On the left, scatterplot of highway mileage vs. displacement, with
+#|   displacement ranging from 5 to 6 and highway mileage ranging from
+#|   10 to 25. The smooth curve overlaid shows a trend that's slightly
+#|   increasing first and then decreasing. On the right, same variables
+#|   are plotted with the same limits, however the smooth curve overlaid
+#|   shows a relatively flat trend with a slight increase at the end.
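+
+# Note: setting scale limits drops observations outside the range before
+# geom_smooth() computes its fit, while coord_cartesian() only zooms the
+# view and keeps all of the data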
+
+# Left
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv)) +
+  geom_smooth() +
+  scale_x_continuous(limits = c(5, 6)) +
+  scale_y_continuous(limits = c(10, 25))
+
+# Right
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv)) +
+  geom_smooth() +
+  coord_cartesian(xlim = c(5, 6), ylim = c(10, 25))
+```
+
+On the other hand, setting the `limits` on individual scales is generally more useful if you want to *expand* the limits, e.g., to match scales across different plots.
+For example, if we extract two classes of cars and plot them separately, it's difficult to compare the plots because all three scales (the x-axis, the y-axis, and the color aesthetic) have different ranges.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| fig-alt: |
+#|   On the left, a scatterplot of highway mileage vs. displacement of SUVs.
+#|   On the right, a scatterplot of the same variables for compact cars.
+#|   Points are colored by drive type for both plots. Among SUVs more of
+#|   the cars are 4-wheel drive and the others are rear-wheel drive, while
+#|   among compact cars more of the cars are front-wheel drive and the others
+#|   are 4-wheel drive. The SUV plot shows a clear negative relationship
+#|   between highway mileage and displacement while in the compact cars plot
+#|   the relationship is much flatter.
+
+suv <- mpg |> filter(class == "suv")
+compact <- mpg |> filter(class == "compact")
+
+# Left
+ggplot(suv, aes(x = displ, y = hwy, color = drv)) +
+  geom_point()
+
+# Right
+ggplot(compact, aes(x = displ, y = hwy, color = drv)) +
+  geom_point()
+```
+
+One way to overcome this problem is to share scales across multiple plots, training the scales with the `limits` of the full data.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| fig-alt: |
+#|   On the left, a scatterplot of highway mileage vs. displacement of SUVs.
+#|   On the right, a scatterplot of the same variables for compact cars.
+#|   Points are colored by drive type for both plots. Both plots are plotted
+#|   on the same scale for highway mileage, displacement, and drive type,
+#|   resulting in the legend showing all three types (front, rear, and 4-wheel
+#|   drive) for both plots even though there are no front-wheel drive SUVs and
+#|   no rear-wheel drive compact cars. Since the x and y scales are the same,
+#|   and go well beyond minimum or maximum highway mileage and displacement,
+#|   the points do not take up the entire plotting area.
+
+x_scale <- scale_x_continuous(limits = range(mpg$displ))
+y_scale <- scale_y_continuous(limits = range(mpg$hwy))
+col_scale <- scale_color_discrete(limits = unique(mpg$drv))
+
+# Left
+ggplot(suv, aes(x = displ, y = hwy, color = drv)) +
+  geom_point() +
+  x_scale +
+  y_scale +
+  col_scale
+
+# Right
+ggplot(compact, aes(x = displ, y = hwy, color = drv)) +
+  geom_point() +
+  x_scale +
+  y_scale +
+  col_scale
+```
+
+In this particular case, you could have simply used faceting, but this technique is useful more generally if, for instance, you want to spread plots over multiple pages of a report.
+
+### Exercises
+
+1. Why doesn't the following code override the default scale?
+
+    ```{r}
+    #| fig-show: "hide"
+
+    df <- tibble(
+      x = rnorm(10000),
+      y = rnorm(10000)
+    )
+
+    ggplot(df, aes(x, y)) +
+      geom_hex() +
+      scale_color_gradient(low = "white", high = "red") +
+      coord_fixed()
+    ```
+
+2. What is the first argument to every scale?
+   How does it compare to `labs()`?
+
+3. Change the display of the presidential terms by:
+
+    a. Combining the two variants that customize colors and x axis breaks.
+    b. Improving the display of the y axis.
+    c. Labelling each term with the name of the president.
+    d. Adding informative plot labels.
+    e. Placing breaks every 4 years (this is trickier than it seems!).
+
+4. First, create the following plot.
+   Then, modify the code using `override.aes` to make the legend easier to see.
+
+    ```{r}
+    #| fig-show: hide
+
+    ggplot(diamonds, aes(x = carat, y = price)) +
+      geom_point(aes(color = cut), alpha = 1/20)
+    ```
+
+## Themes {#sec-themes}
+
+Finally, you can customize the non-data elements of your plot with a theme:
+
+```{r}
+#| message: false
+#| fig-alt: |
+#|   Scatterplot of highway mileage vs. displacement of cars, colored by class
+#|   of car. The plot background is white, with gray grid lines.
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class)) +
+  geom_smooth(se = FALSE) +
+  theme_bw()
+```
+
+ggplot2 includes the eight themes shown in @fig-themes, with `theme_gray()` as the default.[^communication-2]
+Many more are included in add-on packages like **ggthemes**, by Jeffrey Arnold.
+You can also create your own themes, if you are trying to match a particular corporate or journal style.
+
+[^communication-2]: Many people wonder why the default theme has a gray background.
+    This was a deliberate choice because it puts the data forward while still making the grid lines visible.
+    The white grid lines are visible (which is important because they significantly aid position judgments), but they have little visual impact and we can easily tune them out.
+    The gray background gives the plot a similar typographic color to the text, ensuring that the graphics fit in with the flow of a document without jumping out with a bright white background.
+    Finally, the gray background creates a continuous field of color which ensures that the plot is perceived as a single visual entity.
+
+```{r}
+#| label: fig-themes
+#| echo: false
+#| fig-cap: The eight themes built into ggplot2.
+#| fig-alt: |
+#|   Eight barplots created with ggplot2, each
+#|   with one of the eight built-in themes:
+#|   theme_bw() - White background with grid lines,
+#|   theme_light() - Light axes and grid lines,
+#|   theme_classic() - Classic theme, axes but no grid
+#|   lines, theme_linedraw() - Only black lines,
+#|   theme_dark() - Dark background for contrast,
+#|   theme_minimal() - Minimal theme, no background,
+#|   theme_gray() - Gray background (default theme),
+#|   theme_void() - Empty theme, only geoms are visible.
+
+knitr::include_graphics("images/visualization-themes.png")
+```
+
+It's also possible to control individual components of each theme, like the size and color of the font used for the y axis.
+We've already seen that `legend.position` controls where the legend is drawn.
+There are many other aspects of the legend that can be customized with `theme()`.
+For example, in the plot below we change the direction of the legend as well as put a black border around it.
+Note that customization of the legend box and plot title elements of the theme is done with `element_*()` functions.
+These functions specify the styling of non-data components, e.g., the title text is bolded in the `face` argument of `element_text()` and the legend border color is defined in the `color` argument of `element_rect()`.
+The theme elements that control the position of the title and the caption are `plot.title.position` and `plot.caption.position`, respectively.
+In the following plot, these are set to `"plot"` to indicate these elements are aligned to the entire plot area, instead of the plot panel (the default).
+A few other helpful `theme()` components are used to change the placement and format of the title and caption text.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars, colored
+#|   by drive. The plot is titled 'Larger engine sizes tend to have lower fuel
+#|   economy' with the caption pointing to the source of the data, fueleconomy.gov.
+#|   The caption and title are left justified, the legend is inside of the plot
+#|   with a black border.
+
+ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
+  geom_point() +
+  labs(
+    title = "Larger engine sizes tend to have lower fuel economy",
+    caption = "Source: https://fueleconomy.gov."
+  ) +
+  theme(
+    legend.position = c(0.6, 0.7),
+    legend.direction = "horizontal",
+    legend.box.background = element_rect(color = "black"),
+    plot.title = element_text(face = "bold"),
+    plot.title.position = "plot",
+    plot.caption.position = "plot",
+    plot.caption = element_text(hjust = 0)
+  )
+```
+
+For an overview of all `theme()` components, see help with `?theme`.
+The [ggplot2 book](https://ggplot2-book.org/) is also a great place to go for the full details on theming.
+
+### Exercises
+
+1. Pick a theme offered by the ggthemes package and apply it to the last plot you made.
+2. Make the axis labels of your plot blue and bolded.
+
+## Layout
+
+So far we've talked about how to create and modify a single plot.
+What if you have multiple plots you want to lay out in a certain way?
+The patchwork package allows you to combine separate plots into the same graphic.
+We loaded this package earlier in the chapter.
+
+To place two plots next to each other, you can simply add them to each other.
+Note that you first need to create the plots and save them as objects (in the following example they're called `p1` and `p2`).
+Then, you place them next to each other with `+`.
+
+```{r}
+#| fig-width: 6
+#| fig-asp: 0.5
+#| fig-alt: |
+#|   Two plots (a scatterplot of highway mileage versus engine size and a
+#|   side-by-side boxplots of highway mileage versus drive train) placed next
+#|   to each other.
+
+p1 <- ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point() +
+  labs(title = "Plot 1")
+p2 <- ggplot(mpg, aes(x = drv, y = hwy)) +
+  geom_boxplot() +
+  labs(title = "Plot 2")
+p1 + p2
+```
+
+It's important to note that in the above code chunk we did not use a new function from the patchwork package.
+Instead, the package added new functionality to the `+` operator.
+
+You can also create complex plot layouts with patchwork.
+In the following, `|` places `p1` and `p3` next to each other and `/` moves `p2` to the next line.
+
+```{r}
+#| fig-width: 6
+#| fig-asp: 0.8
+#| fig-alt: |
+#|   Three plots laid out such that the first and third plot are next to each
+#|   other and the second plot is stretched beneath them. The first plot is a
+#|   scatterplot of highway mileage versus engine size, the third plot is a
+#|   scatterplot of highway mileage versus city mileage, and the second plot
+#|   is side-by-side boxplots of highway mileage versus drive train.
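+
+# In patchwork, `|` lays plots out side by side and `/` stacks them, so
+# (p1 | p3) / p2 puts p1 and p3 on one row with p2 spanning the row below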
+
+p3 <- ggplot(mpg, aes(x = cty, y = hwy)) +
+  geom_point() +
+  labs(title = "Plot 3")
+(p1 | p3) / p2
+```
+
+Additionally, patchwork allows you to collect legends from multiple plots into one common legend, customize the placement of the legend as well as the dimensions of the plots, and add a common title, subtitle, caption, etc. to your plots.
+Below we create 5 plots.
+We have turned off the legends on the box plots and the scatterplot and collected the legends for the density plots at the top of the plot with `& theme(legend.position = "top")`.
+Note the use of the `&` operator here instead of the usual `+`.
+This is because we're modifying the theme for the patchwork plot as opposed to the individual ggplots.
+The legend is placed on top, inside the `guide_area()`.
+Finally, we have also customized the heights of the various components of our patchwork -- the guide has a height of 1, the box plots 3, density plots 2, and the faceted scatterplot 4.
+Patchwork divides up the area you have allotted for your plot using this scale and places the components accordingly.
+
+```{r}
+#| fig-width: 8
+#| fig-asp: 1
+#| fig-alt: |
+#|   Five plots laid out such that first two plots are next to each other. Plots
+#|   three and four are underneath them. And the fifth plot stretches under them.
+#|   The patchworked plot is titled "City and highway mileage for cars with
+#|   different drive trains" and captioned "Source: https://fueleconomy.gov".
+#|   The first two plots are side-by-side box plots. Plots 3 and 4 are density
+#|   plots. And the fifth plot is a faceted scatterplot. Each of these plots show
+#|   geoms colored by drive train, but the patchworked plot has only one legend
+#|   that applies to all of them, above the plots and beneath the title.
+
+p1 <- ggplot(mpg, aes(x = drv, y = cty, color = drv)) +
+  geom_boxplot(show.legend = FALSE) +
+  labs(title = "Plot 1")
+
+p2 <- ggplot(mpg, aes(x = drv, y = hwy, color = drv)) +
+  geom_boxplot(show.legend = FALSE) +
+  labs(title = "Plot 2")
+
+p3 <- ggplot(mpg, aes(x = cty, color = drv, fill = drv)) +
+  geom_density(alpha = 0.5) +
+  labs(title = "Plot 3")
+
+p4 <- ggplot(mpg, aes(x = hwy, color = drv, fill = drv)) +
+  geom_density(alpha = 0.5) +
+  labs(title = "Plot 4")
+
+p5 <- ggplot(mpg, aes(x = cty, y = hwy, color = drv)) +
+  geom_point(show.legend = FALSE) +
+  facet_wrap(~drv) +
+  labs(title = "Plot 5")
+
+(guide_area() / (p1 + p2) / (p3 + p4) / p5) +
+  plot_annotation(
+    title = "City and highway mileage for cars with different drive trains",
+    caption = "Source: https://fueleconomy.gov."
+  ) +
+  plot_layout(
+    guides = "collect",
+    heights = c(1, 3, 2, 4)
+  ) &
+  theme(legend.position = "top")
+```
+
+If you'd like to learn more about combining and laying out multiple plots with patchwork, we recommend looking through the guides on the package website: <https://patchwork.data-imaginist.com>.
+
+### Exercises
+
+1. What happens if you omit the parentheses in the following plot layout?
+   Can you explain why this happens?
+
+    ```{r}
+    #| fig-show: hide
+
+    p1 <- ggplot(mpg, aes(x = displ, y = hwy)) +
+      geom_point() +
+      labs(title = "Plot 1")
+    p2 <- ggplot(mpg, aes(x = drv, y = hwy)) +
+      geom_boxplot() +
+      labs(title = "Plot 2")
+    p3 <- ggplot(mpg, aes(x = cty, y = hwy)) +
+      geom_point() +
+      labs(title = "Plot 3")
+
+    (p1 | p2) / p3
+    ```
+
+2. Using the three plots from the previous exercise, recreate the following patchwork.
+
+    ```{r}
+    #| fig-width: 7
+    #| fig-asp: 0.8
+    #| echo: false
+    #| fig-alt: |
+    #|   Three plots: Plot 1 is a scatterplot of highway mileage versus engine size.
+    #|   Plot 2 is side-by-side box plots of highway mileage versus drive train.
+    #|   Plot 3 is side-by-side box plots of city mileage versus drive train.
+    #|   Plot 1 is on the first row. Plots 2 and 3 are on the next row, each
+    #|   spanning half the width of Plot 1. Plot 1 is labelled "Fig. A", Plot 2
+    #|   is labelled "Fig. B", and Plot 3 is labelled "Fig. C".
+
+    p1 / (p2 + p3) +
+      plot_annotation(
+        tag_levels = c("A"),
+        tag_prefix = "Fig. ",
+        tag_suffix = ":"
+      )
+    ```
+
+## Summary
+
+In this chapter, you've learned about adding plot labels such as the title, subtitle, and caption, as well as modifying default axis labels, using annotation to add informational text to your plot or to highlight specific data points, customizing the axis scales, and changing the theme of your plot.
+You've also learned about combining multiple plots in a single graph using both simple and complex plot layouts.
+
+While you've so far learned about how to make many different types of plots and how to customize them using a variety of techniques, we've barely scratched the surface of what you can create with ggplot2.
+If you want to get a comprehensive understanding of ggplot2, we recommend reading the book, [*ggplot2: Elegant Graphics for Data Analysis*](https://ggplot2-book.org).
+Other useful resources are the [*R Graphics Cookbook*](https://r-graphics.org) by Winston Chang and [*Fundamentals of Data Visualization*](https://clauswilke.com/dataviz/) by Claus Wilke.
diff --git a/contribs.txt b/contribs.txt
deleted file mode 100644
index b78da20f7..000000000
--- a/contribs.txt
+++ /dev/null
@@ -1,94 +0,0 @@
- 625 hadley
- 93 Garrett
- 77 Hadley Wickham
- 50 S'busiso Mkhondwane
- 21 behrman
- 11 Brett Klamer
- 10 Radu Grosu
- 9 Brandon Greenwell
- 8 Bill Behrman
- 7 Garrett Grolemund
- 7 Rademeyer Vermaak
- 7 Colin Gillespie
- 7 harrismcgehee
- 6 jjchern
- 6 Jakub Nowosad
- 6 OaCantona
- 5 kdpsingh
- 5 Julian During
- 4 Thomas Klebel
- 4 Mine Cetinkaya-Rundel
- 4 Jennifer (Jenny) Bryan
- 4 Terence Teo
- 4 Patrick Kennedy
- 3 Jonathan Page
- 3 Jose Roberto Ayala Solares
- 3 yahwes
- 3 seamus-mckinsey
- 3 Ian Lyttle
- 3 Ian Sealy
- 3 Yihui Xie
- 2 Cooper Morris
- 2 Christian G. Warden
- 2 Daniel Gromer
- 2 Devin Pastoor
- 2 Etienne B. Racine
- 2 Jim Hester
- 2 Joanne Jang
- 2 Kirill Sevastyanenko
- 2 MJMarshall
- 2 Nirmal Patel
- 2 Paul
- 2 Robert Schuessler
- 2 Will Beasley
- 2 rlzijdeman
- 2 robinlovelace
- 2 sibusiso16
- 2 spirgel
- 1 robinsones
- 1 Jeroen Janssens
- 1 Mustafa Ascha
- 1 Nelson Areal
- 1 Nick Clark
- 1 Alex
- 1 Hengni Cai
- 1 Gregory Jefferis
- 1 seanpwilliams
- 1 Peter Hurford
- 1 Flemming Villalona
- 1 Eric Watt
- 1 shoili
- 1 Earl Brown
- 1 Shannon Ellis
- 1 Steve Mortimer
- 1 TJ Mahr
- 1 Dylan Cashman
- 1 Derwin McGeary
- 1 Tom Prior
- 1 Ajay Deonarine
- 1 David Clark
- 1 adi pradhan
- 1 bahadir cankardes
- 1 batpigandme
- 1 Curtis Alexander
- 1 Ahmed ElGabbas
- 1 Christian Mongeau
- 1 jennybc
- 1 Ben Marwick
- 1 jonathanflint
- 1 Andrew Landgraf
- 1 koalabearski
- 1 nate-d-olson
- 1 nickelas
- 1 nwaff
- 1 zeal626
- 1 Jon Calder
- 1 Julia Stewart Lowndes
- 1 John Sears
- 1 Justinas Petuchovas
- 1 Kara Woo
- 1 Kenny Darrell
- 1 svenski
- 1 KyleHumphrey
- 1 Lawrence Wu
- 1 Matthew Sedaghatfar
diff --git a/contribute.qmd b/contribute.qmd
new file mode 100644
index 000000000..7838cc014
--- /dev/null
+++ b/contribute.qmd
@@ -0,0 +1,19 @@
+# Contributing {#sec-contributing}
+
+This book has been developed in the open, and it wouldn't be nearly as good without your contributions.
+There are a number of ways you can help make the book even better:
+
+- If you don't understand something, please [let me know](mailto:h.wickham@gmail.com).
+  Your feedback on what is confusing or hard to understand is valuable.
+
+- If you spot a typo, feel free to edit the underlying page and send a pull request.
+  If you've never done this before, the process is very easy:
+
+  - Click the "Edit this page" link on the sidebar.
+
+  - Make the changes using GitHub's in-page editor and save.
+
+  - Submit a pull request and include a brief description of your changes.
+    "Fixing typos" is perfectly adequate.
+
+  - If you make significant changes, include the phrase "I assign the copyright of this contribution to Hadley Wickham" - I need this so I can publish the printed book.
diff --git a/contribute.rmd b/contribute.rmd
deleted file mode 100644
index 8920eec5b..000000000
--- a/contribute.rmd
+++ /dev/null
@@ -1,23 +0,0 @@
-# Contributing
-
-This book has been developed in the open, and it wouldn't be nearly as good
-without your contributions. There are a number of ways you can help make the
-book even better:
-
-* If you don't understand something, please
-  [let me know](mailto:h.wickham@gmail.com). Your feedback on what is confusing
-  or hard to understand is valuable.
-
-* If you spot a typo, feel free to edit the underlying page and send a pull
-  request. If you've never done this before, the process is very easy:
-
-  * Click the edit this page on the sidebar.
-
-  * Make the changes using github's in-page editor and save.
-
-  * Submit a pull request and include a brief description of your changes.
-    "Fixing typos" is perfectly adequate.
-
-  * If you make significant changes, include the phrase "I assign the
-    copyright of this contribution to Hadley Wickham" - I need this so I can
-    publish the printed book.
diff --git a/contributors.R b/contributors.R
new file mode 100644
index 000000000..db69421c6
--- /dev/null
+++ b/contributors.R
@@ -0,0 +1,31 @@
+library(tidyverse)
+contribs_all_json <- gh::gh("/repos/:owner/:repo/contributors",
+  owner = "hadley",
+  repo = "r4ds",
+  .limit = Inf
+)
+contribs_all <- tibble(
+  login = contribs_all_json %>% map_chr("login"),
+  n = contribs_all_json %>% map_int("contributions")
+)
+
+contribs_old <- read_csv("contributors.csv", col_types = list())
+contribs_new <- contribs_all %>% anti_join(contribs_old)
+
+# Get info for new contributors
+needed_json <- map(
+  contribs_new$login,
+  ~ gh::gh("/users/:username", username = .x)
+)
+info_new <- tibble(
+  login = map_chr(needed_json, "login", .default = NA),
+  name = map_chr(needed_json, "name", .default = NA),
+  blog = map_chr(needed_json, "blog", .default = NA)
+)
+info_old <- contribs_old %>% select(login, name, blog)
+info_all <- bind_rows(info_old, info_new)
+
+contribs_all <- contribs_all %>%
+  left_join(info_all) %>%
+  arrange(login)
+write_csv(contribs_all, "contributors.csv")
diff --git a/contributors.csv b/contributors.csv
new file mode 100644
index 000000000..bd5eee945
--- /dev/null
+++ b/contributors.csv
@@ -0,0 +1,263 @@
+login,n,name,blog
+a-rosenberg,1,NA,NA
+a2800276,1,Tim Becker,NA
+Abinashbunty,1,Abinash Satapathy,https://www.abinash.nl/
+adam-gruer,1,Adam Gruer,adamgruer.rbind.io
+adidoit,1,adi pradhan,http://adidoit.github.io
+Adrianzo,1,A.
s.,NA +aephidayatuloh,1,Aep Hidyatuloh,NA +agila5,1,Andrea Gilardi,NA +ajay-d,1,Ajay Deonarine,http://deonarine.com/ +AlanFeder,1,NA,NA +alansuidaihe,1,Daihe Sui, +alberto-agudo,9,NA, +AlbertRapp,1,NA,NA +aleloi,1,NA,NA +alonzi,1,pete,NA +ALShum,1,Alex,www.ALShum.com +amacfarland,1,Andrew M.,NA +andland,1,Andrew Landgraf,http://andrewlandgraf.com +andyhuynh92,1,NA, +angela-li,1,Angela Li,NA +AnttiRask,1,Antti Rask,youcanbeapirate.com +aquarhead,1,LOU Xun,https://aqd.is +ariespirgel,2,NA,https://arie.rbind.io +august-18,1,NA,NA +aviast,1,Michael Henry,NA +azzaea,1,Azza Ahmed,https://azzaea.netlify.com/ +bambooforest,1,Steven Moran,https://scholar.google.com/citations?user=PpTOh08AAAAJ&hl=en +BarkleyBG,1,Brian G. Barkley,BarkleyBG.netlify.com +batpigandme,5,Mara Averick,https://twitter.com/dataandme +BB1464,1,Oluwafemi OYEDELE,statisticalinference.netlify.app +bbrewington,1,Brent Brewington,NA +behrman,29,Bill Behrman,NA +benherbertson,3,Ben Herbertson,NA +benmarwick,2,Ben Marwick,http://faculty.washington.edu/bmarwick/ +bensteinberg,4,Ben Steinberg,NA +bentyeh,1,Benjamin Yeh,https://bentyeh.github.io +betulturkoglu,1,Betul Turkoglu,NA +bgreenwell,9,Brandon Greenwell,NA +BinxiePeterson,1,Bianca Peterson,NA +BirgerNi,1,Birger Niklas,NA +bklamer,11,Brett Klamer,NA +boardtc,1,NA,NA +c-hoh,1,Christian,hohenfeld.is +caddycarine,1,Caddy,NA +camillevleonard,1,Camille V Leonard,https://www.camillevleonard.com/ +canovasjm,1,NA,NA +cedricbatailler,1,Cedric Batailler,cedricbatailler.me +christina-wei,1,Christina Wei, +chrMongeau,1,Christian Mongeau,http://mongeau.net +coopermor,2,Cooper Morris,NA +csgillespie,7,Colin Gillespie,http://www.mas.ncl.ac.uk/~ncsg3/ +csrvermaak,7,Rademeyer Vermaak,NA +cthierst,1,Chloe Thierstein, +ctsa,1,Chris Saunders,http://www.linkedin.com/in/christophertsaunders +curious-abhinav,1,Abhinav Singh,https://curious-abhinav.github.io +curtisalexander,1,Curtis Alexander,https://www.calex.org +cwarden,2,Christian G. Warden,http://xn.pinkhamster.net/ +cwickham,1,Charlotte Wickham,http://cwick.co.nz +darrkj,1,Kenny Darrell,http://darrkj.github.io/blogs +davidkane9,1,David Kane,www.davidkane.info +davidrsch,6,David,NA +davidrubinger,1,David Rubinger,NA +DDClark,1,David Clark,NA +derwinmcgeary,1,Derwin McGeary,http://derwinmcgeary.github.io +dgromer,2,Daniel Gromer,NA +Divider85,3,NA,NA +djbirke,1,NA,NA +djnavarro,1,Danielle Navarro,https://djnavarro.net +DOH-RPS1303,1,Russell Shean,NA +dongzhuoer,5,Zhuoer Dong,https://dongzhuoer.github.io +dpastoor,2,Devin Pastoor,NA +DSGeoff,1,NA,NA +dthakkar09,1,Devarshi Thakkar, +duju211,13,Julian During,NA +dylancashman,1,Dylan Cashman,https://www.eecs.tufts.edu/~dcashm01/ +eddelbuettel,1,Dirk Eddelbuettel,http://dirk.eddelbuettel.com +EdwinTh,4,Edwin Thoen,thats-so-random.com +elgabbas,1,Ahmed El-Gabbas,https://elgabbas.github.io +enryH,1,Henry Webel,NA +ercan7,1,Ercan Karadas,NA +EricKit,1,Eric Kitaif,NA +ericwatt,1,Eric Watt,www.ericdwatt.com +erikerhardt,2,Erik Erhardt,StatAcumen.com +etiennebr,2,Etienne B. 
Racine,NA +evjrob,1,Everett Robinson,NA +fellennert,1,NA,NA +flemmingmiguel,1,Flemming Miguel,NA +florisvdh,1,Floris Vanderhaeghe,NA +funkybluehen,1,NA,NA +gabrivera,1,NA,NA +gadenbuie,1,Garrick Aden-Buie,https://garrickadenbuie.com +ganong123,1,Peter Ganong,voices.uchicago.edu/ganong +garrettgman,103,Garrett Grolemund,NA +GeroVanMi,1,Gerome Meyer,https://astralibra.ch +gl-eb,1,Gleb Ebert,glebsite.ch +GoldbergData,1,Josh Goldberg,https://twitter.com/GoldbergData +gridgrad,1,bahadir cankardes,NA +gustavdelius,2,Gustav W Delius,NA +hadley,1173,Hadley Wickham,http://hadley.nz +hao-trivago,2,Hao Chen,NA +harrismcgehee,7,Harris McGehee,https://gist.github.com/harrismcgehee +hendrikweisser,1,NA,NA +hengnicai,1,Hengni Cai,NA +Iain-S,1,Iain,NA +iansealy,3,Ian Sealy,NA +ijlyttle,3,Ian Lyttle,NA +ivan-krukov,1,Ivan Krukov,NA +jacobkap,1,Jacob Kaplan,http://crimedatatool.com/ +jazzlw,1,Jazz Weisman,NA +jdblischak,1,John Blischak,https://jdblischak.com/ +jdstorey,1,John D. Storey,http://jdstorey.github.io/ +jefferis,1,Gregory Jefferis,http://www2.mrc-lmb.cam.ac.uk/group-leaders/h-to-m/gregory-jefferis/ +JeffreyRStevens,2,Jeffrey Stevens,https://decisionslab.unl.edu/ +JeldorPKU,1,蒋雨蒙,https://jeldorpku.github.io +jennybc,5,Jennifer (Jenny) Bryan,https://jennybryan.org +jenren,1,Jen Ren,NA +jeroenjanssens,1,Jeroen Janssens,http://jeroenjanssens.com +jeromecholewa,1,NA,NA +jilmun,3,Janet Wesner,jilmun.github.io +jimhester,2,Jim Hester,http://www.jimhester.com +jjchern,6,JJ Chen,NA +jkolacz,1,Jacek Kolacz,NA +joannejang,2,Joanne Jang,joannejang.com +johannes4998,1,NA,NA +johnsears,1,John Sears,NA +jonathanflint,1,NA,NA +jonmcalder,1,Jon Calder,http://joncalder.co.za +jonpage,3,Jonathan Page,economistry.com +jonthegeek,1,Jon Harmon,http://jonthegeek.com +jooyoungseo,2,JooYoung Seo,https://jooyoungseo.github.io +jpetuchovas,1,Justinas Petuchovas,NA +jrdnbradford,1,Jordan,www.linkedin.com/in/jrdnbradford +jrnold,4,Jeffrey Arnold,http://jrnold.me +jroberayalas,7,Jose Roberto Ayala Solares,jroberayalas.netlify.com +jtr13,1,Joyce Robbins,NA +juandering,1,NA,NA +jules32,1,Julia Stewart Lowndes,http://jules32.github.io +kaetschap,1,Sonja,NA +karawoo,1,Kara Woo,http://karawoo.com +katrinleinweber,1,Katrin Leinweber,NA +kdpsingh,5,Karandeep Singh,http://umich.edu/~kdpsingh +kevinxperese,5,Kevin Perese,NA +kferris10,1,Kevin Ferris,NA +kirillseva,2,Kirill Sevastyanenko,NA +KittJonathan,15,Jonathan Kitt,NA +koalabearski,1,NA,NA +krlmlr,1,Kirill Müller,NA +kucharsky,1,Rafał Kucharski,NA +kwstat,1,Kevin Wright,NA +landesbergn,1,Noah Landesberg,noahlandesberg.com +lawwu,1,Lawrence Wu,NA +lindbrook,1,NA,NA +lwjohnst86,2,Luke W Johnston,lukewjohnston.com +MarckK,1,Kara de la Marck,https://www.linkedin.com/in/karadelamarck +marwahaha,1,Kunal Marwaha,kunalmarwaha.com/about +matanhakim,1,Matan Hakim,NA +MatthiasLiew,3,Matthias Liew,NA +MattWittbrodt,1,Matt Wittbrodt,mattwittbrodt.com +maurolepore,2,Mauro Lepore,https://fgeo.netlify.com/ +mbeveridge,7,Mark Beveridge,https://twitter.com/mbeveridge +mcewenkhundi,1,NA,NA +mcsnowface,6,"mcsnowface, PhD",NA +mfherman,1,Matt Herman,mattherman.info +michaelboerman,1,Michael Boerman,https://michaelboerman.com +mine-cetinkaya-rundel,158,Mine Cetinkaya-Rundel,https://stat.duke.edu/~mc301 +mitsuoxv,31,Mitsuo Shiota,https://mitsuoxv.rbind.io/ +mjhendrickson,1,Matthew Hendrickson,https://about.me/matthew.j.hendrickson +MJMarshall,2,NA,NA +mkfin7,1,Misty Knight-Finley, +mmhamdy,1,Mohammed Hamdy,NA +mnazarov,1,Maxim Nazarov,NA +mpaulacaldas,4,Maria Paula Caldas,mpaulacaldas.com 
+mustafaascha,1,Mustafa Ascha,NA +nareal,1,Nelson Areal,nelsonareal.net +nate-d-olson,1,Nate Olson,NA +nateaff,1,Nathanael,nateaff.com +nattalides,1,NA,NA +NedJWestern,1,Ned Western,NA +nickclark1000,1,Nick Clark,NA +nickelas,1,NA,NA +nirmalpatel,2,Nirmal Patel,http://playpowerlabs.com +nischalshrestha,1,Nischal Shrestha,http://nischalshrestha.me +njtierney,1,Nicholas Tierney,http://www.njtierney.com +Nowosad,6,Jakub Nowosad,https://nowosad.github.io +nstjhp,1,Nick Pullen, +olivier6088,1,NA,NA +oliviercailloux,1,Olivier Cailloux,https://www.lamsade.dauphine.fr/~ocailloux/ +p0bs,1,Robin Penfold,p0bs.com +pabloedug,1,Pablo E. Garcia,NA +padamson,1,Paul Adamson,padamson.github.io +penelopeysm,1,Penelope Y,NA +peterhurford,1,Peter Hurford,http://www.peterhurford.com +petzi53,14,Peter Baumgartner,https://notes.peter-baumgartner.net/ +pkq,4,Patrick Kennedy,NA +pooyataher,1,Pooya Taherkhani,https://gitlab.com/pooyat +PursuitOfDataScience,14,Y. Yu,https://youzhi.netlify.app/ +radugrosu,10,Radu Grosu,radugrosu.com +Ranae,2,Ranae Dietzel,ranae.github.io +rastrau,2,Ralph Straumann,https://ralphstraumann.ch +raynamharris,1,Rayna M Harris,https://www.raynamharris.com +ReeceGoding,1,NA,NA +rgertenbach,1,Robin Gertenbach,NA +RIngyao,1,Jajo,NA +rivaquiroga,1,Riva Quiroga,https://rivaquiroga.cl/ +RJHKnight,1,Richard Knight,NA +rlzijdeman,2,Richard Zijdeman,NA +robertchu03,1,NA,NA +RobinKohrs,1,Robin Kohrs,https://quarantino.netlify.app/ +Robinlovelace,2,Robin,http://robinlovelace.net +robinsones,1,Emily Robinson,robinsones.github.io +robtenorio,1,Rob Tenorio,NA +RodAli,1,Rod Mazloomi,NA +RohanAlexander,5,Rohan Alexander,https://www.rohanalexander.com/ +RomeroBarata,1,Romero Morais,NA +rudeboybert,1,Albert Y. Kim,http://rudeboybert.rbind.io/ +saghirb,3,Saghir,http://www.ilustat.com +salmasian,1,Hojjat Salmasian,NA +sauercrowd,1,Jonas,https://blog.sauercrowdlabs.xyz +sciencificity,3,Vebash Naidoo,https://sciencificity-blog.netlify.app/ +seamus-mckinsey,4,Seamus McKinsey,NA +seanpwilliams,1,NA,NA +seasmith,1,Luke Smith,https://seasmith.github.io +sedaghatfar,3,Matthew Sedaghatfar,NA +sekR4,1,Sebastian Kraus,https://www.linkedin.com/in/sebastiankrausjena +sfirke,1,Sam Firke,samfirke.com +ShanEllis,1,Shannon Ellis,shanellis.com +shoili,1,NA,shoili.github.io +Shurakai,2,Christian Heinrich,NA +sibusiso16,52,S'busiso Mkhondwane,NA +sm-raiyyan,1,SM Raiyyan, +sonicdoe,11,Jakob Krigovsky,https://sonicdoe.com +stephan-koenig,3,Stephan Koenig,stephankoenig.me +stephenbalogun,6,Stephen Balogun,https://stephenbalogun.github.io/stbalogun/ +StevenMMortimer,1,Steven M. 
Mortimer,https://stevenmortimer.com +stragu,4,Stéphane Guillou,https://stragu.github.io/ +sulgik,2,Sulgi Kim, +svenski,1,Sergiusz Bleja,NA +talgalili,1,Tal Galili,https://www.r-statistics.com +Taurenamo,1,Alec Fisher, +tgerarden,1,Todd Gerarden,http://toddgerarden.com +thomasggodfrey,1,Tom Godfrey, +timbroderick,1,Tim Broderick,http://www.timbroderick.net +timwaterhouse,1,Tim Waterhouse,NA +tjmahr,1,TJ Mahr,tjmahr.com +tklebel,4,Thomas Klebel,https://thomasklebel.eu +tomjamesprior,1,Tom Prior,NA +tteo,4,Terence Teo,tteo.github.io +twgardner2,1,NA,NA +ulyngs,4,Ulrik Lyngs,www.ulriklyngs.com +uribo,1,Shinya Uryu,https://uribo.hatenablog.com +vanderlindenma,1,Martin Van der Linden,NA +waltersom,1,Walter Somerville,NA +werkstattcodes,1,NA,http://werk.statt.codes +wibeasley,2,Will Beasley,http://scholar.google.com/citations?user=ffsJTC0AAAAJ&hl=en +yihui,4,Yihui Xie,https://yihui.name +yimingli,3,Yiming (Paul) Li,https://yimingli.net +yingxingwu,1,NA,NA +yutannihilation,1,Hiroaki Yutani,https://twitter.com/yutannihilation +yuyu-aung,1,Yu Yu Aung,NA +zachbogart,1,Zach Bogart,zachbogart.com +zeal626,1,NA,NA +zekiakyol,16,Zeki Akyol,zekiakyol.com diff --git a/cover.jpg b/cover.jpg new file mode 100644 index 000000000..fbe1bff6b Binary files /dev/null and b/cover.jpg differ diff --git a/cover.png b/cover.png deleted file mode 100644 index a7150bdfa..000000000 Binary files a/cover.png and /dev/null differ diff --git a/data-import.qmd b/data-import.qmd new file mode 100644 index 000000000..4f97b9222 --- /dev/null +++ b/data-import.qmd @@ -0,0 +1,539 @@ +# Data import {#sec-data-import} + +```{r} +#| echo: false + +source("_common.R") +``` + +## Introduction + +Working with data provided by R packages is a great way to learn data science tools, but you want to apply what you've learned to your own data at some point. +In this chapter, you'll learn the basics of reading data files into R. + +Specifically, this chapter will focus on reading plain-text rectangular files. +We'll start with practical advice for handling features like column names, types, and missing data. +You will then learn about reading data from multiple files at once and writing data from R to a file. +Finally, you'll learn how to handcraft data frames in R. + +### Prerequisites + +In this chapter, you'll learn how to load flat files in R with the **readr** package, which is part of the core tidyverse. + +```{r} +#| label: setup +#| message: false + +library(tidyverse) +``` + +## Reading data from a file + +To begin, we'll focus on the most common rectangular data file type: CSV, which is short for comma-separated values. +Here is what a simple CSV file looks like. +The first row, commonly called the header row, gives the column names, and the following six rows provide the data. +The columns are separated, aka delimited, by commas. + +```{r} +#| echo: false +#| message: false +#| comment: "" + +read_lines("data/students.csv") |> cat(sep = "\n") +``` + +@tbl-students-table shows a representation of the same data as a table. + +```{r} +#| label: tbl-students-table +#| echo: false +#| message: false +#| tbl-cap: Data from the students.csv file as a table. + +read_csv("data/students.csv") |> + knitr::kable() +``` + +We can read this file into R using `read_csv()`. +The first argument is the most important: the path to the file. +You can think about the path as the address of the file: the file is called `students.csv` and it lives in the `data` folder.
+ +```{r} +#| message: true + +students <- read_csv("data/students.csv") +``` + +The code above will work if you have the `students.csv` file in a `data` folder in your project. +You can download the `students.csv` file from <https://pos.it/r4ds-students-csv> or you can read it directly from that URL with: + +```{r} +#| eval: false + +students <- read_csv("https://pos.it/r4ds-students-csv") +``` + +When you run `read_csv()`, it prints out a message telling you the number of rows and columns of data, the delimiter that was used, and the column specifications (names of columns organized by the type of data the column contains). +It also prints out some information about retrieving the full column specification and how to quiet this message. +This message is an integral part of readr, and we'll return to it in @sec-col-types. + +### Practical advice + +Once you read data in, the first step usually involves transforming it in some way to make it easier to work with in the rest of your analysis. +Let's take another look at the `students` data with that in mind. + +```{r} +students +``` + +In the `favourite.food` column, there are a bunch of food items, and then the character string `N/A`, which should have been a real `NA` that R will recognize as "not available". +This is something we can address using the `na` argument. +By default, `read_csv()` only recognizes empty strings (`""`) in this dataset as `NA`s; we want it to also recognize the character string `"N/A"`. + +```{r} +#| message: false +students <- read_csv("data/students.csv", na = c("N/A", "")) + +students +``` + +You might also notice that the `Student ID` and `Full Name` columns are surrounded by backticks. +That's because they contain spaces, breaking R's usual rules for variable names; they're **non-syntactic** names. +To refer to these variables, you need to surround them with backticks, `` ` ``: + +```{r} +students |> + rename( + student_id = `Student ID`, + full_name = `Full Name` + ) +``` + +An alternative approach is to use `janitor::clean_names()`, which uses some heuristics to turn them all into snake case at once[^data-import-1]. + +[^data-import-1]: The [janitor](http://sfirke.github.io/janitor/) package is not part of the tidyverse, but it offers handy functions for data cleaning and works well within data pipelines that use `|>`. + +```{r} +#| message: false + +students |> janitor::clean_names() +``` + +Another common task after reading in data is to consider variable types. +For example, `meal_plan` is a categorical variable with a known set of possible values, which in R should be represented as a factor: + +```{r} +students |> + janitor::clean_names() |> + mutate(meal_plan = factor(meal_plan)) +``` + +Note that the values in the `meal_plan` variable have stayed the same, but the type of variable denoted underneath the variable name has changed from character (`<chr>`) to factor (`<fct>`). +You'll learn more about factors in @sec-factors. + +Before you analyze these data, you'll probably want to fix the `age` and `id` columns. +Currently, `age` is a character variable because one of the observations is typed out as `five` instead of a numeric `5`. +We discuss the details of fixing this issue in @sec-import-spreadsheets. + +```{r} +students <- students |> + janitor::clean_names() |> + mutate( + meal_plan = factor(meal_plan), + age = parse_number(if_else(age == "five", "5", age)) + ) + +students +``` + +A new function here is `if_else()`, which has three arguments. +The first argument `test` should be a logical vector.
+The result will contain the value of the second argument, `yes`, when `test` is `TRUE`, and the value of the third argument, `no`, when it is `FALSE`. +Here we're saying if `age` is the character string `"five"`, make it `"5"`, and if not leave it as `age`. +You will learn more about `if_else()` and logical vectors in @sec-logicals. + +### Other arguments + +There are a couple of other important arguments that we need to mention, and they'll be easier to demonstrate if we first show you a handy trick: `read_csv()` can read text strings that you've created and formatted like a CSV file: + +```{r} +#| message: false + +read_csv( + "a,b,c + 1,2,3 + 4,5,6" +) +``` + +Usually, `read_csv()` uses the first line of the data for the column names, which is a very common convention. +But it's not uncommon for a few lines of metadata to be included at the top of the file. +You can use `skip = n` to skip the first `n` lines or use `comment = "#"` to drop all lines that start with (e.g.) `#`: + +```{r} +#| message: false + +read_csv( + "The first line of metadata + The second line of metadata + x,y,z + 1,2,3", + skip = 2 +) + +read_csv( + "# A comment I want to skip + x,y,z + 1,2,3", + comment = "#" +) +``` + +In other cases, the data might not have column names. +You can use `col_names = FALSE` to tell `read_csv()` not to treat the first row as headings and instead label them sequentially from `X1` to `Xn`: + +```{r} +#| message: false + +read_csv( + "1,2,3 + 4,5,6", + col_names = FALSE +) +``` + +Alternatively, you can pass `col_names` a character vector which will be used as the column names: + +```{r} +#| message: false + +read_csv( + "1,2,3 + 4,5,6", + col_names = c("x", "y", "z") +) +``` + +These arguments are all you need to know to read the majority of CSV files that you'll encounter in practice. +(For the rest, you'll need to carefully inspect your `.csv` file and read the documentation for `read_csv()`'s many other arguments.) + +### Other file types + +Once you've mastered `read_csv()`, using readr's other functions is straightforward; it's just a matter of knowing which function to reach for: + +- `read_csv2()` reads semicolon-separated files. + These use `;` instead of `,` to separate fields and are common in countries that use `,` as the decimal marker. + +- `read_tsv()` reads tab-delimited files. + +- `read_delim()` reads in files with any delimiter, attempting to automatically guess the delimiter if you don't specify it. + +- `read_fwf()` reads fixed-width files. + You can specify fields by their widths with `fwf_widths()` or by their positions with `fwf_positions()`. + +- `read_table()` reads a common variation of fixed-width files where columns are separated by white space. + +- `read_log()` reads Apache-style log files. + +### Exercises + +1. What function would you use to read a file where fields were separated with "\|"? + +2. Apart from `file`, `skip`, and `comment`, what other arguments do `read_csv()` and `read_tsv()` have in common? + +3. What are the most important arguments to `read_fwf()`? + +4. Sometimes strings in a CSV file contain commas. + To prevent them from causing problems, they need to be surrounded by a quoting character, like `"` or `'`. By default, `read_csv()` assumes that the quoting character will be `"`. + To read the following text into a data frame, what argument to `read_csv()` do you need to specify? + + ```{r} + #| eval: false + + "x,y\n1,'a,b'" + ``` + +5. Identify what is wrong with each of the following inline CSV files. 
+ What happens when you run the code? + + ```{r} + #| eval: false + + read_csv("a,b\n1,2,3\n4,5,6") + read_csv("a,b,c\n1,2\n1,2,3,4") + read_csv("a,b\n\"1") + read_csv("a,b\n1,2\na,b") + read_csv("a;b\n1;3") + ``` + +6. Practice referring to non-syntactic names in the following data frame by: + + a. Extracting the variable called `1`. + b. Plotting a scatterplot of `1` vs. `2`. + c. Creating a new column called `3`, which is `2` divided by `1`. + d. Renaming the columns to `one`, `two`, and `three`. + + ```{r} + annoying <- tibble( + `1` = 1:10, + `2` = `1` * 2 + rnorm(length(`1`)) + ) + ``` + +## Controlling column types {#sec-col-types} + +A CSV file doesn't contain any information about the type of each variable (i.e., whether it's a logical, number, string, etc.), so readr will try to guess the type. +This section describes how the guessing process works, how to resolve some common problems that cause it to fail, and, if needed, how to supply the column types yourself. +Finally, we'll mention a few general strategies that are useful if readr is failing catastrophically and you need to get more insight into the structure of your file. + +### Guessing types + +readr uses a heuristic to figure out the column types. +For each column, it pulls the values of 1,000[^data-import-2] rows spaced evenly from the first row to the last, ignoring missing values. +It then works through the following questions: + +[^data-import-2]: You can override the default of 1000 with the `guess_max` argument. + +- Does it contain only `F`, `T`, `FALSE`, or `TRUE` (ignoring case)? If so, it's a logical. +- Does it contain only numbers (e.g., `1`, `-4.5`, `5e6`, `Inf`)? If so, it's a number. +- Does it match the ISO8601 standard? If so, it's a date or date-time. (We'll return to date-times in more detail in @sec-creating-datetimes). +- Otherwise, it must be a string. + +You can see that behavior in action in this simple example: + +```{r} +#| message: false + +read_csv(" + logical,numeric,date,string + TRUE,1,2021-01-15,abc + false,4.5,2021-02-15,def + T,Inf,2021-02-16,ghi +") +``` + +This heuristic works well if you have a clean dataset, but in real life, you'll encounter a selection of weird and beautiful failures. + +### Missing values, column types, and problems + +The most common way column detection fails is that a column contains unexpected values, and you get a character column instead of a more specific type. +One of the most common causes for this is a missing value, recorded using something other than the `NA` that readr expects. + +Take this simple one-column CSV file as an example: + +```{r} +simple_csv <- " + x + 10 + . + 20 + 30" +``` + +If we read it without any additional arguments, `x` becomes a character column: + +```{r} +#| message: false + +read_csv(simple_csv) +``` + +In this very small case, you can easily see the missing value `.`. +But what happens if you have thousands of rows with only a few missing values represented by `.`s sprinkled among them? +One approach is to tell readr that `x` is a numeric column, and then see where it fails. +You can do that with the `col_types` argument, which takes a named list where the names match the column names in the CSV file: + +```{r} +df <- read_csv( + simple_csv, + col_types = list(x = col_double()) +) +``` + +Now `read_csv()` reports that there was a problem, and tells us we can find out more with `problems()`: + +```{r} +problems(df) +``` + +This tells us that there was a problem in row 3, col 1, where readr expected a double but got a `.`.
+That suggests this dataset uses `.` for missing values. +So if we then set `na = "."`, the automatic guessing succeeds, giving us the numeric column that we want: + +```{r} +#| message: false + +read_csv(simple_csv, na = ".") +``` + +### Column types + +readr provides a total of nine column types for you to use: + +- `col_logical()` and `col_double()` read logicals and real numbers. They're relatively rarely needed (except as above), since readr will usually guess them for you. +- `col_integer()` reads integers. We seldom distinguish integers and doubles in this book because they're functionally equivalent, but reading integers explicitly can occasionally be useful because they occupy half the memory of doubles. +- `col_character()` reads strings. This can be useful to specify explicitly when you have a column that is a numeric identifier, i.e., long series of digits that identifies an object but doesn't make sense to apply mathematical operations to. Examples include phone numbers, social security numbers, credit card numbers, etc. +- `col_factor()`, `col_date()`, and `col_datetime()` create factors, dates, and date-times respectively; you'll learn more about those when we get to those data types in @sec-factors and @sec-dates-and-times. +- `col_number()` is a permissive numeric parser that will ignore non-numeric components, and is particularly useful for currencies. You'll learn more about it in @sec-numbers. +- `col_skip()` skips a column so it's not included in the result, which can be useful for speeding up reading the data if you have a large CSV file and you only want to use some of the columns. + +It's also possible to override the default column type by switching from `list()` to `cols()` and specifying `.default`: + +```{r} +another_csv <- " +x,y,z +1,2,3" + +read_csv( + another_csv, + col_types = cols(.default = col_character()) +) +``` + +Another useful helper is `cols_only()`, which will read in only the columns you specify: + +```{r} +read_csv( + another_csv, + col_types = cols_only(x = col_character()) +) +``` + +## Reading data from multiple files {#sec-readr-directory} + +Sometimes your data is split across multiple files instead of being contained in a single file. +For example, you might have sales data for multiple months, with each month's data in a separate file: `01-sales.csv` for January, `02-sales.csv` for February, and `03-sales.csv` for March. +With `read_csv()` you can read these data in at once and stack them on top of each other in a single data frame. + +```{r} +#| message: false + +sales_files <- c("data/01-sales.csv", "data/02-sales.csv", "data/03-sales.csv") +read_csv(sales_files, id = "file") +``` + +Once again, the code above will work if you have the CSV files in a `data` folder in your project. +You can download these files from <https://pos.it/r4ds-01-sales>, <https://pos.it/r4ds-02-sales>, and <https://pos.it/r4ds-03-sales> or you can read them directly with: + +```{r} +#| eval: false + +sales_files <- c( + "https://pos.it/r4ds-01-sales", + "https://pos.it/r4ds-02-sales", + "https://pos.it/r4ds-03-sales" +) +read_csv(sales_files, id = "file") +``` + +The `id` argument adds a new column called `file` to the resulting data frame that identifies the file the data come from. +This is especially helpful in circumstances where the files you're reading in do not have an identifying column that can help you trace the observations back to their original sources. + +If you have many files you want to read in, it can get cumbersome to write out their names as a list.
+Instead, you can use the base `list.files()` function to find the files for you by matching a pattern in the file names. +You'll learn more about these patterns in @sec-regular-expressions. + +```{r} +sales_files <- list.files("data", pattern = "sales\\.csv$", full.names = TRUE) +sales_files +``` + +## Writing to a file {#sec-writing-to-a-file} + +readr also comes with two useful functions for writing data back to disk: `write_csv()` and `write_tsv()`. +The most important arguments to these functions are `x` (the data frame to save) and `file` (the location to save it). +You can also specify how missing values are written with `na`, and if you want to `append` to an existing file. + +```{r} +#| eval: false + +write_csv(students, "students.csv") +``` + +Now let's read that CSV file back in. +Note that the variable type information that you just set up is lost when you save to CSV because you're starting over with reading from a plain text file again: + +```{r} +#| warning: false +#| message: false + +students +write_csv(students, "students-2.csv") +read_csv("students-2.csv") +``` + +This makes CSVs a little unreliable for caching interim results---you need to recreate the column specification every time you load it in. +There are two main alternatives: + +1. `write_rds()` and `read_rds()` are uniform wrappers around the base functions `readRDS()` and `saveRDS()`. + These store data in R's custom binary format called RDS. + This means that when you reload the object, you are loading the *exact same* R object that you stored. + + ```{r} + write_rds(students, "students.rds") + read_rds("students.rds") + ``` + +2. The arrow package allows you to read and write parquet files, a fast binary file format that can be shared across programming languages. + We'll return to arrow in more depth in @sec-arrow. + + ```{r} + #| eval: false + + library(arrow) + write_parquet(students, "students.parquet") + read_parquet("students.parquet") + #> # A tibble: 6 × 5 + #> student_id full_name favourite_food meal_plan age + #> <dbl> <chr> <chr> <fct> <dbl> + #> 1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4 + #> 2 2 Barclay Lynn French fries Lunch only 5 + #> 3 3 Jayendra Lyne NA Breakfast and lunch 7 + #> 4 4 Leon Rossini Anchovies Lunch only NA + #> 5 5 Chidiegwu Dunkel Pizza Breakfast and lunch 5 + #> 6 6 Güvenç Attila Ice cream Lunch only 6 + ``` + +Parquet tends to be much faster than RDS and is usable outside of R, but does require the arrow package. + +```{r} +#| include: false +file.remove("students-2.csv") +file.remove("students.rds") +``` + +## Data entry + +Sometimes you'll need to assemble a tibble "by hand", doing a little data entry in your R script. +There are two useful functions to help you do this, which differ in whether you lay out the tibble by columns or by rows. +`tibble()` works by column: + +```{r} +tibble( + x = c(1, 2, 5), + y = c("h", "m", "g"), + z = c(0.08, 0.83, 0.60) +) +``` + +Laying out the data by column can make it hard to see how the rows are related, so an alternative is `tribble()`, short for **tr**ansposed t**ibble**, which lets you lay out your data row by row. +`tribble()` is customized for data entry in code: column headings start with `~` and entries are separated by commas. +This makes it possible to lay out small amounts of data in an easy-to-read form: + +```{r} +tribble( + ~x, ~y, ~z, + 1, "h", 0.08, + 2, "m", 0.83, + 5, "g", 0.60 +) +``` + +## Summary + +In this chapter, you've learned how to load CSV files with `read_csv()` and to do your own data entry with `tibble()` and `tribble()`.
+You've learned how CSV files work, some of the problems you might encounter, and how to overcome them. +We'll come back to data import a few times in this book: @sec-import-spreadsheets will show you how to load data from Excel and Google Sheets, @sec-import-databases from databases, @sec-arrow from parquet files, @sec-rectangling from JSON, and @sec-scraping from websites. + +We're just about at the end of this section of the book, but there's one important last topic to cover: how to get help. +So in the next chapter, you'll learn some good places to look for help, how to create a reprex to maximize your chances of getting good help, and some general advice on keeping up with the world of R. diff --git a/data-tidy.qmd b/data-tidy.qmd new file mode 100644 index 000000000..c9f962c16 --- /dev/null +++ b/data-tidy.qmd @@ -0,0 +1,591 @@ +# Data tidying {#sec-data-tidy} + +```{r} +#| echo: false + +source("_common.R") +``` + +## Introduction + +> "Happy families are all alike; every unhappy family is unhappy in its own way."\ +> --- Leo Tolstoy + +> "Tidy datasets are all alike, but every messy dataset is messy in its own way."\ +> --- Hadley Wickham + +In this chapter, you will learn a consistent way to organize your data in R using a system called **tidy data**. +Getting your data into this format requires some work up front, but that work pays off in the long term. +Once you have tidy data and the tidy tools provided by packages in the tidyverse, you will spend much less time munging data from one representation to another, allowing you to spend more time on the data questions you care about. + +In this chapter, you'll first learn the definition of tidy data and see it applied to a simple toy dataset. +Then we'll dive into the primary tool you'll use for tidying data: pivoting. +Pivoting allows you to change the form of your data without changing any of the values. + +### Prerequisites + +In this chapter, we'll focus on tidyr, a package that provides a bunch of tools to help tidy up your messy datasets. +tidyr is a member of the core tidyverse. + +```{r} +#| label: setup +#| message: false + +library(tidyverse) +``` + +From this chapter on, we'll suppress the loading message from `library(tidyverse)`. + +## Tidy data {#sec-tidy-data} + +You can represent the same underlying data in multiple ways. +The example below shows the same data organized in three different ways. +Each dataset shows the same values of four variables: *country*, *year*, *population*, and number of documented *cases* of TB (tuberculosis), but each dataset organizes the values in a different way. + +```{r} +table1 + +table2 + +table3 +``` + +These are all representations of the same underlying data, but they are not equally easy to use. +One of them, `table1`, will be much easier to work with inside the tidyverse because it's **tidy**. + +There are three interrelated rules that make a dataset tidy: + +1. Each variable is a column; each column is a variable. +2. Each observation is a row; each row is an observation. +3. Each value is a cell; each cell is a single value. + +@fig-tidy-structure shows the rules visually. + +```{r} +#| label: fig-tidy-structure +#| echo: false +#| fig-cap: | +#| The following three rules make a dataset tidy: variables are columns, +#| observations are rows, and values are cells. +#| fig-alt: | +#| Three panels, each representing a tidy data frame. The first panel +#| shows that each variable is a column. The second panel shows that each +#| observation is a row.
The third panel shows that each value is +#| a cell. + +knitr::include_graphics("images/tidy-1.png", dpi = 270) +``` + +Why ensure that your data is tidy? +There are two main advantages: + +1. There's a general advantage to picking one consistent way of storing data. + If you have a consistent data structure, it's easier to learn the tools that work with it because they have an underlying uniformity. + +2. There's a specific advantage to placing variables in columns because it allows R's vectorized nature to shine. + As you learned in @sec-mutate and @sec-summarize, most built-in R functions work with vectors of values. + That makes transforming tidy data feel particularly natural. + +dplyr, ggplot2, and all the other packages in the tidyverse are designed to work with tidy data. +Here are a few small examples showing how you might work with `table1`. + +```{r} +#| fig-width: 5 +#| fig-alt: | +#| This figure shows the number of cases in 1999 and 2000 for +#| Afghanistan, Brazil, and China, with year on the x-axis and number +#| of cases on the y-axis. Each point on the plot represents the number +#| of cases in a given country in a given year. The points for each +#| country are differentiated from others by color and shape and connected +#| with a line, resulting in three, non-parallel, non-intersecting lines. +#| The numbers of cases in China are highest for both 1999 and 2000, with +#| values above 200,000 for both years. The number of cases in Brazil is +#| approximately 40,000 in 1999 and approximately 75,000 in 2000. The +#| numbers of cases in Afghanistan are lowest for both 1999 and 2000, with +#| values that appear to be very close to 0 on this scale. + +# Compute rate per 10,000 +table1 |> + mutate(rate = cases / population * 10000) + +# Compute total cases per year +table1 |> + group_by(year) |> + summarize(total_cases = sum(cases)) + +# Visualize changes over time +ggplot(table1, aes(x = year, y = cases)) + + geom_line(aes(group = country), color = "grey50") + + geom_point(aes(color = country, shape = country)) + + scale_x_continuous(breaks = c(1999, 2000)) # x-axis breaks at 1999 and 2000 +``` + +### Exercises + +1. For each of the sample tables, describe what each observation and each column represents. + +2. Sketch out the process you'd use to calculate the `rate` for `table2` and `table3`. + You will need to perform four operations: + + a. Extract the number of TB cases per country per year. + b. Extract the matching population per country per year. + c. Divide cases by population, and multiply by 10000. + d. Store back in the appropriate place. + + You haven't yet learned all the functions you'd need to actually perform these operations, but you should still be able to think through the transformations you'd need. + +## Lengthening data {#sec-pivoting} + +The principles of tidy data might seem so obvious that you wonder if you'll ever encounter a dataset that isn't tidy. +Unfortunately, however, most real data is untidy. +There are two main reasons: + +1. Data is often organized to facilitate some goal other than analysis. + For example, it's common for data to be structured to make data entry, not analysis, easy. + +2. Most people aren't familiar with the principles of tidy data, and it's hard to derive them yourself unless you spend a lot of time working with data. + +This means that most real analyses will require at least a little tidying. +You'll begin by figuring out what the underlying variables and observations are. 
+Sometimes this is easy; other times you'll need to consult with the people who originally generated the data. +Next, you'll **pivot** your data into a tidy form, with variables in the columns and observations in the rows. + +tidyr provides two functions for pivoting data: `pivot_longer()` and `pivot_wider()`. +We'll start with `pivot_longer()` because it's the most common case. +Let's dive into some examples. + +### Data in column names {#sec-billboard} + +The `billboard` dataset records the billboard rank of songs in the year 2000: + +```{r} +billboard +``` + +In this dataset, each observation is a song. +The first three columns (`artist`, `track`, and `date.entered`) are variables that describe the song. +Then we have 76 columns (`wk1`-`wk76`) that describe the rank of the song in each week[^data-tidy-1]. +Here, the column names are one variable (the `week`) and the cell values are another (the `rank`). + +[^data-tidy-1]: The song will be included as long as it was in the top 100 at some point in 2000, and is tracked for up to 72 weeks after it appears. + +To tidy this data, we'll use `pivot_longer()`: + +```{r, R.options=list(pillar.print_min = 10)} +billboard |> + pivot_longer( + cols = starts_with("wk"), + names_to = "week", + values_to = "rank" + ) +``` + +After the data, there are three key arguments: + +- `cols` specifies which columns need to be pivoted, i.e., which columns aren't variables. This argument uses the same syntax as `select()`, so here we could use `!c(artist, track, date.entered)` or `starts_with("wk")`. +- `names_to` names the variable stored in the column names; here we named that variable `week`. +- `values_to` names the variable stored in the cell values; here we named that variable `rank`. + +Note that in the code `"week"` and `"rank"` are quoted because those are new variables we're creating; they don't yet exist in the data when we run the `pivot_longer()` call. + +Now let's turn our attention to the resulting, longer data frame. +What happens if a song is in the top 100 for less than 76 weeks? +Take 2 Pac's "Baby Don't Cry", for example. +The above output suggests that it was only in the top 100 for 7 weeks, and all the remaining weeks are filled in with missing values. +These `NA`s don't really represent unknown observations; they were forced to exist by the structure of the dataset[^data-tidy-2], so we can ask `pivot_longer()` to get rid of them by setting `values_drop_na = TRUE`: + +[^data-tidy-2]: We'll come back to this idea in @sec-missing-values. + +```{r} +billboard |> + pivot_longer( + cols = starts_with("wk"), + names_to = "week", + values_to = "rank", + values_drop_na = TRUE + ) +``` + +The number of rows is now much lower, indicating that many rows with `NA`s were dropped. + +You might also wonder what happens if a song is in the top 100 for more than 76 weeks. +We can't tell from this data, but you might guess that additional columns `wk77`, `wk78`, ... would be added to the dataset. + +This data is now tidy, but we could make future computation a bit easier by converting values of `week` from character strings to numbers using `mutate()` and `readr::parse_number()`. +`parse_number()` is a handy function that will extract the first number from a string, ignoring all other text.
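+For example, here's a quick sketch of what it does with week labels like the ones in this dataset: + +```{r} +# parse_number() drops the "wk" prefix and keeps the first number it finds +parse_number(c("wk1", "wk12", "wk76")) +```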
+ +```{r} +billboard_longer <- billboard |> + pivot_longer( + cols = starts_with("wk"), + names_to = "week", + values_to = "rank", + values_drop_na = TRUE + ) |> + mutate( + week = parse_number(week) + ) +billboard_longer +``` + +Now that we have all the week numbers in one variable and all the rank values in another, we're in a good position to visualize how song ranks vary over time. +The code is shown below and the result is in @fig-billboard-ranks. +We can see that very few songs stay in the top 100 for more than 20 weeks. + +```{r} +#| label: fig-billboard-ranks +#| fig-cap: | +#| A line plot showing how the rank of a song changes over time. +#| fig-alt: | +#| A line plot with week on the x-axis and rank on the y-axis, where +#| each line represents a song. Most songs appear to start at a high rank, +#| rapidly accelerate to a low rank, and then decay again. There are +#| surprisingly few tracks in the region when week is >20 and rank is +#| >50. + +billboard_longer |> + ggplot(aes(x = week, y = rank, group = track)) + + geom_line(alpha = 0.25) + + scale_y_reverse() +``` + +### How does pivoting work? + +Now that you've seen how we can use pivoting to reshape our data, let's take a little time to gain some intuition about what pivoting does to the data. +Let's start with a very simple dataset to make it easier to see what's happening. +Suppose we have three patients with `id`s A, B, and C, and we take two blood pressure measurements on each patient. +We'll create the data with `tribble()`, a handy function for constructing small tibbles by hand: + +```{r} +df <- tribble( + ~id, ~bp1, ~bp2, + "A", 100, 120, + "B", 140, 115, + "C", 120, 125 +) +``` + +We want our new dataset to have three variables: `id` (already exists), `measurement` (the column names), and `value` (the cell values). +To achieve this, we need to pivot `df` longer: + +```{r} +df |> + pivot_longer( + cols = bp1:bp2, + names_to = "measurement", + values_to = "value" + ) +``` + +How does the reshaping work? +It's easier to see if we think about it column by column. +As shown in @fig-pivot-variables, the values in a column that was already a variable in the original dataset (`id`) need to be repeated, once for each column that is pivoted. + +```{r} +#| label: fig-pivot-variables +#| echo: false +#| fig-cap: | +#| Columns that are already variables need to be repeated, once for +#| each column that is pivoted. +#| fig-alt: | +#| A diagram showing how `pivot_longer()` transforms a simple +#| dataset, using color to highlight how the values in the `id` column +#| ("A", "B", "C") are each repeated twice in the output because there are +#| two columns being pivoted ("bp1" and "bp2"). + +knitr::include_graphics("diagrams/tidy-data/variables.png", dpi = 270) +``` + +The column names become values in a new variable, whose name is defined by `names_to`, as shown in @fig-pivot-names. +They need to be repeated once for each row in the original dataset. + +```{r} +#| label: fig-pivot-names +#| echo: false +#| fig-cap: | +#| The column names of pivoted columns become values in a new column. The +#| values need to be repeated once for each row of the original dataset. +#| fig-alt: | +#| A diagram showing how `pivot_longer()` transforms a simple +#| data set, using color to highlight how column names ("bp1" and +#| "bp2") become the values in a new `measurement` column. They are repeated +#| three times because there were three rows in the input. 
+ +knitr::include_graphics("diagrams/tidy-data/column-names.png", dpi = 270) +``` + +The cell values also become values in a new variable, with a name defined by `values_to`. +They are unwound row by row. +@fig-pivot-values illustrates the process. + +```{r} +#| label: fig-pivot-values +#| echo: false +#| fig-cap: | +#| The number of values is preserved (not repeated), but unwound +#| row-by-row. +#| fig-alt: | +#| A diagram showing how `pivot_longer()` transforms data, +#| using color to highlight how the cell values (blood pressure measurements) +#| become the values in a new `value` column. They are unwound row-by-row, +#| so the original rows (100,120), then (140,115), then (120,125), become +#| a column running from 100 to 125. + +knitr::include_graphics("diagrams/tidy-data/cell-values.png", dpi = 270) +``` + +### Many variables in column names + +A more challenging situation occurs when you have multiple pieces of information crammed into the column names, and you would like to store these in separate new variables. +For example, take the `who2` dataset, the source of `table1` and friends that you saw above: + +```{r} +who2 +``` + +This dataset, collected by the World Health Organisation, records information about tuberculosis diagnoses. +There are two columns that are already variables and are easy to interpret: `country` and `year`. +They are followed by 56 columns like `sp_m_014`, `ep_m_4554`, and `rel_m_3544`. +If you stare at these columns for long enough, you'll notice there's a pattern. +Each column name is made up of three pieces separated by `_`. +The first piece, `sp`/`rel`/`ep`, describes the method used for the diagnosis, the second piece, `m`/`f`, is the `gender` (coded as a binary variable in this dataset), and the third piece, `014`/`1524`/`2534`/`3544`/`4554`/`5564`/`65`, is the `age` range (`014` represents 0-14, for example). + +So in this case we have six pieces of information recorded in `who2`: the country and the year (already columns); the method of diagnosis, the gender category, and the age range category (contained in the other column names); and the count of patients in that category (cell values). +To organize these six pieces of information in six separate columns, we use `pivot_longer()` with a vector of column names for `names_to` and instructions for splitting the original variable names into pieces for `names_sep` as well as a column name for `values_to`: + +```{r} +who2 |> + pivot_longer( + cols = !(country:year), + names_to = c("diagnosis", "gender", "age"), + names_sep = "_", + values_to = "count" + ) +``` + +An alternative to `names_sep` is `names_pattern`, which you can use to extract variables from more complicated naming scenarios, once you've learned about regular expressions in @sec-regular-expressions. + +Conceptually, this is only a minor variation on the simpler case you've already seen. +@fig-pivot-multiple-names shows the basic idea: now, instead of the column names pivoting into a single column, they pivot into multiple columns. +You can imagine this happening in two steps (first pivoting and then separating) but under the hood it happens in a single step because that's faster. + +```{r} +#| label: fig-pivot-multiple-names +#| echo: false +#| fig-cap: | +#| Pivoting columns with multiple pieces of information in the names +#| means that each column name now fills in values in multiple output +#| columns.
+#| fig-alt: | +#| A diagram that uses color to illustrate how supplying `names_sep` +#| and multiple `names_to` creates multiple variables in the output. +#| The input has variable names "x_1" and "y_2" which are split up +#| by "_" to create name and number columns in the output. This is +#| similar to the case with a single `names_to`, but what would have been a +#| single output variable is now separated into multiple variables. + +knitr::include_graphics("diagrams/tidy-data/multiple-names.png", dpi = 270) +``` + +### Data and variable names in the column headers + +The next step up in complexity is when the column names include a mix of variable values and variable names. +For example, take the `household` dataset: + +```{r} +household +``` + +This dataset contains data about five families, with the names and dates of birth of up to two children. +The new challenge in this dataset is that the column names contain the names of two variables (`dob`, `name`) and the values of another (`child`, with values 1 or 2). +To solve this problem, we again need to supply a vector to `names_to`, but this time we use the special `".value"` sentinel; this isn't the name of a variable but a unique value that tells `pivot_longer()` to do something different. +This overrides the usual `values_to` argument to use the first component of the pivoted column name as a variable name in the output. + +```{r} +household |> + pivot_longer( + cols = !family, + names_to = c(".value", "child"), + names_sep = "_", + values_drop_na = TRUE + ) +``` + +We again use `values_drop_na = TRUE`, since the shape of the input forces the creation of explicit missing values (e.g., for families with only one child). + +@fig-pivot-names-and-values illustrates the basic idea with a simpler example. +When you use `".value"` in `names_to`, the column names in the input contribute to both values and variable names in the output. + +```{r} +#| label: fig-pivot-names-and-values +#| echo: false +#| fig-cap: | +#| Pivoting with `names_to = c(".value", "num")` splits the column names +#| into two components: the first part determines the output column +#| name (`x` or `y`), and the second part determines the value of the +#| `num` column. +#| fig-alt: | +#| A diagram that uses color to illustrate how the special ".value" +#| sentinel works. The input has names "x_1", "x_2", "y_1", and "y_2", +#| and we want to use the first component ("x", "y") as a variable name +#| and the second ("1", "2") as the value for a new "num" column. + +knitr::include_graphics("diagrams/tidy-data/names-and-values.png", dpi = 270) +``` + +## Widening data + +So far we've used `pivot_longer()` to solve the common class of problems where values have ended up in column names. +Next we'll pivot (HA HA) to `pivot_wider()`, which makes datasets **wider** by increasing columns and reducing rows and helps when one observation is spread across multiple rows. +This seems to arise less commonly in the wild, but it does seem to crop up a lot when dealing with governmental data. + +We'll start by looking at `cms_patient_experience`, a dataset from the Centers for Medicare and Medicaid Services that collects data about patient experiences: + +```{r} +cms_patient_experience +``` + +The core unit being studied is an organization, but each organization is spread across six rows, with one row for each measurement taken in the survey.
+We can see the complete set of values for `measure_cd` and `measure_title` by using `distinct()`: + +```{r} +cms_patient_experience |> + distinct(measure_cd, measure_title) +``` + +Neither of these columns will make particularly great variable names: `measure_cd` doesn't hint at the meaning of the variable and `measure_title` is a long sentence containing spaces. +We'll use `measure_cd` as the source for our new column names for now, but in a real analysis you might want to create your own variable names that are both short and meaningful. + +`pivot_wider()` has the opposite interface to `pivot_longer()`: instead of choosing new column names, we need to provide the existing columns that define the values (`values_from`) and the column name (`names_from`): + +```{r} +cms_patient_experience |> + pivot_wider( + names_from = measure_cd, + values_from = prf_rate + ) +``` + +The output doesn't look quite right; we still seem to have multiple rows for each organization. +That's because we also need to tell `pivot_wider()` which column or columns have values that uniquely identify each row; in this case those are the variables starting with `"org"`: + +```{r} +cms_patient_experience |> + pivot_wider( + id_cols = starts_with("org"), + names_from = measure_cd, + values_from = prf_rate + ) +``` + +This gives us the output that we're looking for. + +### How does `pivot_wider()` work? + +To understand how `pivot_wider()` works, let's again start with a very simple dataset. +This time we have two patients with `id`s A and B; we have three blood pressure measurements on patient A and two on patient B: + +```{r} +df <- tribble( + ~id, ~measurement, ~value, + "A", "bp1", 100, + "B", "bp1", 140, + "B", "bp2", 115, + "A", "bp2", 120, + "A", "bp3", 105 +) +``` + +We'll take the values from the `value` column and the names from the `measurement` column: + +```{r} +df |> + pivot_wider( + names_from = measurement, + values_from = value + ) +``` + +To begin the process, `pivot_wider()` needs to first figure out what will go in the rows and columns. +The new column names will be the unique values of `measurement`. + +```{r} +df |> + distinct(measurement) |> + pull() +``` + +By default, the rows in the output are determined by all the variables that aren't going into the new names or values. +These are called the `id_cols`. +Here there is only one column, but in general there can be any number. + +```{r} +df |> + select(-measurement, -value) |> + distinct() +``` + +`pivot_wider()` then combines these results to generate an empty data frame: + +```{r} +df |> + select(-measurement, -value) |> + distinct() |> + mutate(bp1 = NA, bp2 = NA, bp3 = NA) +``` + +It then fills in all the missing values using the data in the input. +In this case, not every cell in the output has a corresponding value in the input as there's no third blood pressure measurement for patient B, so that cell remains missing. +We'll come back to this idea that `pivot_wider()` can "make" missing values in @sec-missing-values. + +You might also wonder what happens if there are multiple rows in the input that correspond to one cell in the output.
+The example below has two rows that correspond to `id` "A" and `measurement` "bp1": + +```{r} +df <- tribble( + ~id, ~measurement, ~value, + "A", "bp1", 100, + "A", "bp1", 102, + "A", "bp2", 120, + "B", "bp1", 140, + "B", "bp2", 115 +) +``` + +If we attempt to pivot this, we get an output that contains list-columns, which you'll learn more about in @sec-rectangling: + +```{r} +df |> + pivot_wider( + names_from = measurement, + values_from = value + ) +``` + +Since you don't know how to work with this sort of data yet, you'll want to follow the hint in the warning to figure out where the problem is: + +```{r} +df |> + group_by(id, measurement) |> + summarize(n = n(), .groups = "drop") |> + filter(n > 1) +``` + +It's then up to you to figure out what's gone wrong with your data and either repair the underlying damage or use your grouping and summarizing skills to ensure that each combination of row and column values only has a single row. + +## Summary + +In this chapter you learned about tidy data: data that has variables in columns and observations in rows. +Tidy data makes working in the tidyverse easier, because it's a consistent structure understood by most functions. The main challenge is transforming the data from whatever structure you receive it in to a tidy format. +To that end, you learned about `pivot_longer()` and `pivot_wider()`, which allow you to tidy up many untidy datasets. +The examples we presented here are a selection of those from `vignette("pivot", package = "tidyr")`, so if you encounter a problem that this chapter doesn't help you with, that vignette is a good place to try next. + +Another challenge is that, for a given dataset, it can be impossible to label the longer or the wider version as the "tidy" one. +This is partly a reflection of our definition of tidy data, where we said tidy data has one variable in each column, but we didn't actually define what a variable is (and it's surprisingly hard to do so). +It's totally fine to be pragmatic and to say a variable is whatever makes your analysis easiest. +So if you're stuck figuring out how to do some computation, consider switching up the organisation of your data; don't be afraid to untidy, transform, and re-tidy as needed! + +If you enjoyed this chapter and want to learn more about the underlying theory, you can read about the history and theoretical underpinnings in the [Tidy Data](https://www.jstatsoft.org/article/view/v059i10) paper published in the Journal of Statistical Software. + +Now that you're writing a substantial amount of R code, it's time to learn more about organizing your code into files and directories. +In the next chapter, you'll learn all about the advantages of scripts and projects, and some of the many tools that they provide to make your life easier. diff --git a/data-transform.qmd b/data-transform.qmd new file mode 100644 index 000000000..3011f0ec2 --- /dev/null +++ b/data-transform.qmd @@ -0,0 +1,887 @@ +# Data transformation {#sec-data-transform} + +```{r} +#| echo: false + +source("_common.R") +``` + +## Introduction + +Visualization is an important tool for generating insight, but it's rare that you get the data in exactly the right form you need to make the graph you want. +Often you'll need to create some new variables or summaries to answer your questions with your data, or maybe you just want to rename the variables or reorder the observations to make the data a little easier to work with. +You'll learn how to do all that (and more!)
in this chapter, which will introduce you to data transformation using the **dplyr** package and a new dataset on flights that departed from New York City in 2013. + +The goal of this chapter is to give you an overview of all the key tools for transforming a data frame. +We'll start with functions that operate on rows and then columns of a data frame, then circle back to talk more about the pipe, an important tool that you use to combine verbs. +We will then introduce the ability to work with groups. +We will end the chapter with a case study that showcases these functions in action, and we'll come back to the functions in more detail in later chapters, as we start to dig into specific types of data (e.g., numbers, strings, dates). + +### Prerequisites + +In this chapter, we'll focus on the dplyr package, another core member of the tidyverse. +We'll illustrate the key ideas using data from the nycflights13 package, and use ggplot2 to help us understand the data. + +```{r} +#| label: setup + +library(nycflights13) +library(tidyverse) +``` + +Take careful note of the conflicts message that's printed when you load the tidyverse. +It tells you that dplyr overwrites some functions in base R. +If you want to use the base version of these functions after loading dplyr, you'll need to use their full names: `stats::filter()` and `stats::lag()`. +So far we've mostly ignored which package a function comes from because most of the time it doesn't matter. +However, knowing the package can help you find help and find related functions, so when we need to be precise about which package a function comes from, we'll use the same syntax as R: `packagename::functionname()`. + +### nycflights13 + +To explore the basic dplyr verbs, we're going to use `nycflights13::flights`. +This dataset contains all `r format(nrow(nycflights13::flights), big.mark = ",")` flights that departed from New York City in 2013. +The data comes from the US [Bureau of Transportation Statistics](http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0), and is documented in `?flights`. + +```{r} +flights +``` + +`flights` is a tibble, a special type of data frame used by the tidyverse to avoid some common gotchas. +The most important difference between tibbles and data frames is the way tibbles print; they are designed for large datasets, so they only show the first few rows and only the columns that fit on one screen. +There are a few options to see everything. +If you're using RStudio, the most convenient is probably `View(flights)`, which will open an interactive scrollable and filterable view. +Otherwise you can use `print(flights, width = Inf)` to show all columns, or use `glimpse()`: + +```{r} +glimpse(flights) +``` + +In both views, the variable names are followed by abbreviations that tell you the type of each variable: `<int>` is short for integer, `<dbl>` is short for double (aka real numbers), `<chr>` for character (aka strings), and `<dttm>` for date-time. +These are important because the operations you can perform on a column depend so much on its "type". + +### dplyr basics + +You're about to learn the primary dplyr verbs (functions), which will allow you to solve the vast majority of your data manipulation challenges. +But before we discuss their individual differences, it's worth stating what they have in common: + +1. The first argument is always a data frame. + +2. The subsequent arguments typically describe which columns to operate on, using the variable names (without quotes). + +3. The output is always a new data frame.
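+ +As a quick sketch of these three properties (using `filter()`, which you'll meet properly below), note that the first argument is the data frame, the column `dest` is referred to without quotes, and the result is a new data frame; `flights` itself is left unchanged: + +```{r} +# filter() takes a data frame as its first argument, refers to columns by +# their bare names, and returns a new data frame without modifying flights +nrow(filter(flights, dest == "IAH")) +nrow(flights) +```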
+ +Because each verb does one thing well, solving complex problems will usually require combining multiple verbs, and we'll do so with the pipe, `|>`. +We'll discuss the pipe more in @sec-the-pipe, but in brief, the pipe takes the thing on its left and passes it along to the function on its right so that `x |> f(y)` is equivalent to `f(x, y)`, and `x |> f(y) |> g(z)` is equivalent to `g(f(x, y), z)`. +The easiest way to pronounce the pipe is "then". +That makes it possible to get a sense of the following code even though you haven't yet learned the details: + +```{r} +#| eval: false + +flights |> + filter(dest == "IAH") |> + group_by(year, month, day) |> + summarize( + arr_delay = mean(arr_delay, na.rm = TRUE) + ) +``` + +dplyr's verbs are organized into four groups based on what they operate on: **rows**, **columns**, **groups**, or **tables**. +In the following sections you'll learn the most important verbs for rows, columns, and groups, then we'll come back to the join verbs that work on tables in @sec-joins. +Let's dive in! + +## Rows + +The most important verbs that operate on rows of a dataset are `filter()`, which changes which rows are present without changing their order, and `arrange()`, which changes the order of the rows without changing which are present. +Both functions only affect the rows, and the columns are left unchanged. +We'll also discuss `distinct()`, which finds rows with unique values; unlike `arrange()` and `filter()`, it can also optionally modify the columns. + +### `filter()` + +`filter()` allows you to keep rows based on the values of the columns[^data-transform-1]. +The first argument is the data frame. +The second and subsequent arguments are the conditions that must be true to keep the row. +For example, we could find all flights that departed more than 120 minutes (two hours) late: + +[^data-transform-1]: Later, you'll learn about the `slice_*()` family, which allows you to choose rows based on their positions. + +```{r} +flights |> + filter(dep_delay > 120) +``` + +As well as `>` (greater than), you can use `>=` (greater than or equal to), `<` (less than), `<=` (less than or equal to), `==` (equal to), and `!=` (not equal to). +You can also combine conditions with `&` or `,` to indicate "and" (check for both conditions) or with `|` to indicate "or" (check for either condition): + +```{r} +# Flights that departed on January 1 +flights |> + filter(month == 1 & day == 1) + +# Flights that departed in January or February +flights |> + filter(month == 1 | month == 2) +``` + +There's a useful shortcut when you're combining `|` and `==`: `%in%`. +It keeps rows where the variable equals one of the values on the right: + +```{r} +# A shorter way to select flights that departed in January or February +flights |> + filter(month %in% c(1, 2)) +``` + +We'll come back to these comparisons and logical operators in more detail in @sec-logicals. + +When you run `filter()`, dplyr executes the filtering operation, creating a new data frame, and then prints it. +It doesn't modify the existing `flights` dataset because dplyr functions never modify their inputs. +To save the result, you need to use the assignment operator, `<-`: + +```{r} +jan1 <- flights |> + filter(month == 1 & day == 1) +``` + +### Common mistakes + +When you're starting out with R, the easiest mistake to make is to use `=` instead of `==` when testing for equality.
+`filter()` will let you know when this happens: + +```{r} +#| error: true + +flights |> + filter(month = 1) +``` + +Another mistake is writing "or" statements like you would in English: + +```{r} +#| eval: false + +flights |> + filter(month == 1 | 2) +``` + +This "works", in the sense that it doesn't throw an error, but it doesn't do what you want because `|` first checks the condition `month == 1` and then checks the condition `2`, which is not a sensible condition to check. +We'll learn more about what's happening here and why in @sec-boolean-operations. + +### `arrange()` + +`arrange()` changes the order of the rows based on the value of the columns. +It takes a data frame and a set of column names (or more complicated expressions) to order by. +If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns. +For example, the following code sorts by the departure time, which is spread over four columns. +We get the earliest years first, then within a year the earliest months, etc. + +```{r} +flights |> + arrange(year, month, day, dep_time) +``` + +You can use `desc()` on a column inside of `arrange()` to re-order the data frame based on that column in descending (big-to-small) order. +For example, this code orders flights from most to least delayed: + +```{r} +flights |> + arrange(desc(dep_delay)) +``` + +Note that the number of rows has not changed -- we're only arranging the data, we're not filtering it. + +### `distinct()` + +`distinct()` finds all the unique rows in a dataset, so in a technical sense, it primarily operates on the rows. +Most of the time, however, you'll want the distinct combination of some variables, so you can also optionally supply column names: + +```{r} +# Remove duplicate rows, if any +flights |> + distinct() + +# Find all unique origin and destination pairs +flights |> + distinct(origin, dest) +``` + +Alternatively, if you want to keep the other columns when filtering for unique rows, you can use the `.keep_all = TRUE` option. + +```{r} +flights |> + distinct(origin, dest, .keep_all = TRUE) +``` + +It's not a coincidence that all of these distinct flights are on January 1: `distinct()` will find the first occurrence of a unique row in the dataset and discard the rest. + +If you want to find the number of occurrences instead, you're better off swapping `distinct()` for `count()`, and with the `sort = TRUE` argument you can arrange them in descending order of number of occurrences. +You'll learn more about `count()` in @sec-counts. + +```{r} +flights |> + count(origin, dest, sort = TRUE) +``` + +### Exercises + +1. In a single pipeline for each condition, find all flights that meet the condition: + + - Had an arrival delay of two or more hours + - Flew to Houston (`IAH` or `HOU`) + - Were operated by United, American, or Delta + - Departed in summer (July, August, and September) + - Arrived more than two hours late, but didn't leave late + - Were delayed by at least an hour, but made up over 30 minutes in flight + +2. Sort `flights` to find the flights with the longest departure delays. + Find the flights that left earliest in the morning. + +3. Sort `flights` to find the fastest flights. + (Hint: Try including a math calculation inside of your function.) + +4. Was there a flight on every day of 2013? + +5. Which flights traveled the farthest distance? + Which traveled the least distance? + +6. Does it matter in what order you use `filter()` and `arrange()` if you're using both? + Why/why not?
+ Think about the results and how much work the functions would have to do.

## Columns

There are four important verbs that affect the columns without changing the rows: `mutate()` creates new columns that are derived from the existing columns, `select()` changes which columns are present, `rename()` changes the names of the columns, and `relocate()` changes the positions of the columns.

### `mutate()` {#sec-mutate}

The job of `mutate()` is to add new columns that are calculated from the existing columns.
In the transform chapters, you'll learn a large set of functions that you can use to manipulate different types of variables.
For now, we'll stick with basic algebra, which allows us to compute the `gain`, how much time a delayed flight made up in the air, and the `speed` in miles per hour:

```{r}
flights |>
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )
```

By default, `mutate()` adds new columns on the right hand side of your dataset, which makes it difficult to see what's happening here.
We can use the `.before` argument to instead add the variables to the left hand side[^data-transform-2]:

[^data-transform-2]: Remember that in RStudio, the easiest way to see a dataset with many columns is `View()`.

```{r}
flights |>
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    .before = 1
  )
```

The `.` is a sign that `.before` is an argument to the function, not the name of a third new variable we are creating.
You can also use `.after` to add after a variable, and in both `.before` and `.after` you can use the variable name instead of a position.
For example, we could add the new variables after `day`:

```{r}
#| results: false

flights |>
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    .after = day
  )
```

Alternatively, you can control which variables are kept with the `.keep` argument.
A particularly useful argument is `"used"`, which specifies that we only keep the columns that were involved or created in the `mutate()` step.
For example, the following output will contain only the variables `dep_delay`, `arr_delay`, `air_time`, `gain`, `hours`, and `gain_per_hour`.

```{r}
#| results: false

flights |>
  mutate(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours,
    .keep = "used"
  )
```

Note that since we haven't assigned the result of the above computation back to `flights`, the new variables `gain`, `hours`, and `gain_per_hour` will only be printed but will not be stored in a data frame.
And if we want them to be available in a data frame for future use, we should think carefully about whether we want the result to be assigned back to `flights`, overwriting the original data frame with many more variables, or to a new object.
Often, the right answer is a new object that is named informatively to indicate its contents, e.g., `delay_gain`, but you might also have good reasons for overwriting `flights`.

### `select()` {#sec-select}

It's not uncommon to get datasets with hundreds or even thousands of variables.
In this situation, the first challenge is often just focusing on the variables you're interested in.
+`select()` allows you to rapidly zoom in on a useful subset using operations based on the names of the variables: + +- Select columns by name: + + ```{r} + #| results: false + + flights |> + select(year, month, day) + ``` + +- Select all columns between year and day (inclusive): + + ```{r} + #| results: false + + flights |> + select(year:day) + ``` + +- Select all columns except those from year to day (inclusive): + + ```{r} + #| results: false + + flights |> + select(!year:day) + ``` + + You can also use `-` instead of `!` (and you're likely to see that in the wild); we recommend `!` because it reads as "not", and combines well with `&` and `|`. + +- Select all columns that are characters: + + ```{r} + #| results: false + + flights |> + select(where(is.character)) + ``` + +There are a number of helper functions you can use within `select()`: + +- `starts_with("abc")`: matches names that begin with "abc". +- `ends_with("xyz")`: matches names that end with "xyz". +- `contains("ijk")`: matches names that contain "ijk". +- `num_range("x", 1:3)`: matches `x1`, `x2` and `x3`. + +See `?select` for more details. +Once you know regular expressions (the topic of @sec-regular-expressions) you'll also be able to use `matches()` to select variables that match a pattern. + +You can rename variables as you `select()` them by using `=`. +The new name appears on the left hand side of the `=`, and the old variable appears on the right hand side: + +```{r} +flights |> + select(tail_num = tailnum) +``` + +### `rename()` + +If you want to keep all the existing variables and just want to rename a few, you can use `rename()` instead of `select()`: + +```{r} +flights |> + rename(tail_num = tailnum) +``` + +If you have a bunch of inconsistently named columns and it would be painful to fix them all by hand, check out `janitor::clean_names()` which provides some useful automated cleaning. + +### `relocate()` + +Use `relocate()` to move variables around. +You might want to collect related variables together or move important variables to the front. +By default `relocate()` moves variables to the front: + +```{r} +flights |> + relocate(time_hour, air_time) +``` + +You can also specify where to put them using the `.before` and `.after` arguments, just like in `mutate()`: + +```{r} +#| results: false + +flights |> + relocate(year:dep_time, .after = time_hour) +flights |> + relocate(starts_with("arr"), .before = dep_time) +``` + +### Exercises + +```{r} +#| eval: false +#| echo: false + +# For data checking, not used in results shown in book +flights <- flights |> mutate( + dep_time = hour * 60 + minute, + arr_time = (arr_time %/% 100) * 60 + (arr_time %% 100), + airtime2 = arr_time - dep_time, + dep_sched = dep_time + dep_delay +) + +ggplot(flights, aes(x = dep_sched)) + geom_histogram(binwidth = 60) +ggplot(flights, aes(x = dep_sched %% 60)) + geom_histogram(binwidth = 1) +ggplot(flights, aes(x = air_time - airtime2)) + geom_histogram() +``` + +1. Compare `dep_time`, `sched_dep_time`, and `dep_delay`. + How would you expect those three numbers to be related? + +2. Brainstorm as many ways as possible to select `dep_time`, `dep_delay`, `arr_time`, and `arr_delay` from `flights`. + +3. What happens if you specify the name of the same variable multiple times in a `select()` call? + +4. What does the `any_of()` function do? + Why might it be helpful in conjunction with this vector? + + ```{r} + variables <- c("year", "month", "day", "dep_delay", "arr_delay") + ``` + +5. 
Does the result of running the following code surprise you?
   How do the select helpers deal with upper and lower case by default?
   How can you change that default?

   ```{r}
   #| eval: false

   flights |> select(contains("TIME"))
   ```

6. Rename `air_time` to `air_time_min` to indicate units of measurement and move it to the beginning of the data frame.

7. Why doesn't the following work, and what does the error mean?

   ```{r}
   #| error: true

   flights |>
     select(tailnum) |>
     arrange(arr_delay)
   ```

## The pipe {#sec-the-pipe}

We've shown you simple examples of the pipe above, but its real power arises when you start to combine multiple verbs.
For example, imagine that you wanted to find the fast flights to Houston's IAH airport: you need to combine `filter()`, `mutate()`, `select()`, and `arrange()`:

```{r}
flights |>
  filter(dest == "IAH") |>
  mutate(speed = distance / air_time * 60) |>
  select(year:day, dep_time, carrier, flight, speed) |>
  arrange(desc(speed))
```

Even though this pipeline has four steps, it's easy to skim because the verbs come at the start of each line: start with the `flights` data, then filter, then mutate, then select, then arrange.

What would happen if we didn't have the pipe?
We could nest each function call inside the previous call:

```{r}
#| results: false

arrange(
  select(
    mutate(
      filter(
        flights,
        dest == "IAH"
      ),
      speed = distance / air_time * 60
    ),
    year:day, dep_time, carrier, flight, speed
  ),
  desc(speed)
)
```

Or we could use a bunch of intermediate objects:

```{r}
#| results: false

flights1 <- filter(flights, dest == "IAH")
flights2 <- mutate(flights1, speed = distance / air_time * 60)
flights3 <- select(flights2, year:day, dep_time, carrier, flight, speed)
arrange(flights3, desc(speed))
```

While both forms have their time and place, the pipe generally produces data analysis code that is easier to write and read.

To add the pipe to your code, we recommend using the built-in keyboard shortcut Ctrl/Cmd + Shift + M.
You'll need to make one change to your RStudio options to use `|>` instead of `%>%` as shown in @fig-pipe-options; more on `%>%` shortly.

```{r}
#| label: fig-pipe-options
#| echo: false
#| fig-cap: |
#|   To insert `|>`, make sure the "Use native pipe operator" option is checked.
#| fig-alt: |
#|   Screenshot showing the "Use native pipe operator" option which can
#|   be found on the "Editing" panel of the "Code" options.

knitr::include_graphics("screenshots/rstudio-pipe-options.png")
```

::: callout-note
## magrittr

If you've been using the tidyverse for a while, you might be familiar with the `%>%` pipe provided by the **magrittr** package.
The magrittr package is included in the core tidyverse, so you can use `%>%` whenever you load the tidyverse:

```{r}
#| eval: false

library(tidyverse)

mtcars %>%
  group_by(cyl) %>%
  summarize(n = n())
```

For simple cases, `|>` and `%>%` behave identically.
So why do we recommend the base pipe?
Firstly, because it's part of base R, it's always available for you to use, even when you're not using the tidyverse.
Secondly, `|>` is quite a bit simpler than `%>%`: in the time between the invention of `%>%` in 2014 and the inclusion of `|>` in R 4.1.0 in 2021, we gained a better understanding of the pipe.
This allowed the base implementation to jettison infrequently used and less important features.
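
For comparison, here's the same pipeline written with the base pipe; for a simple case like this, the two should produce identical output:

```{r}
#| eval: false

# Identical result, using the base pipe instead of magrittr's %>%
mtcars |>
  group_by(cyl) |>
  summarize(n = n())
```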
+
:::

## Groups

So far you've learned about functions that work with rows and columns.
dplyr gets even more powerful when you add in the ability to work with groups.
In this section, we'll focus on the most important functions: `group_by()`, `summarize()`, and the slice family of functions.

### `group_by()`

Use `group_by()` to divide your dataset into groups meaningful for your analysis:

```{r}
flights |>
  group_by(month)
```

`group_by()` doesn't change the data but, if you look closely at the output, you'll notice that the output indicates that it is "grouped by" month (`Groups: month [12]`).
This means subsequent operations will now work "by month".
`group_by()` adds this grouped feature (referred to as class) to the data frame, which changes the behavior of the subsequent verbs applied to the data.

### `summarize()` {#sec-summarize}

The most important grouped operation is a summary, which, if being used to calculate a single summary statistic, reduces the data frame to have a single row for each group.
In dplyr, this operation is performed by `summarize()`[^data-transform-3], as shown by the following example, which computes the average departure delay by month:

[^data-transform-3]: Or `summarise()`, if you prefer British English.

```{r}
flights |>
  group_by(month) |>
  summarize(
    avg_delay = mean(dep_delay)
  )
```

Uhoh!
Something has gone wrong and all of our results are `NA`s (pronounced "N-A"), R's symbol for missing values.
This happened because some of the observed flights had missing data in the delay column, and so when we calculated the mean including those values, we got an `NA` result.
We'll come back to discuss missing values in detail in @sec-missing-values, but for now we'll tell the `mean()` function to ignore all missing values by setting the argument `na.rm` to `TRUE`:

```{r}
flights |>
  group_by(month) |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE)
  )
```

You can create any number of summaries in a single call to `summarize()`.
You'll learn various useful summaries in the upcoming chapters, but one very useful summary is `n()`, which returns the number of rows in each group:

```{r}
flights |>
  group_by(month) |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n()
  )
```

Means and counts can get you a surprisingly long way in data science!

### The `slice_` functions

There are five handy functions that allow you to extract specific rows within each group:

- `df |> slice_head(n = 1)` takes the first row from each group.
- `df |> slice_tail(n = 1)` takes the last row in each group.
- `df |> slice_min(x, n = 1)` takes the row with the smallest value of column `x`.
- `df |> slice_max(x, n = 1)` takes the row with the largest value of column `x`.
- `df |> slice_sample(n = 1)` takes one random row.

You can vary `n` to select more than one row, or instead of `n =`, you can use `prop = 0.1` to select (e.g.) 10% of the rows in each group.
For example, the following code finds the flights that are most delayed upon arrival at each destination:

```{r}
flights |>
  group_by(dest) |>
  slice_max(arr_delay, n = 1) |>
  relocate(dest)
```

Note that there are 105 destinations but we get 108 rows here.
What's up?
`slice_min()` and `slice_max()` keep tied values so `n = 1` means give us all rows with the highest value.
If you want exactly one row per group you can set `with_ties = FALSE`.
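
For example, a minimal sketch of that variant; it should return exactly one row per destination, keeping only the first of any tied rows:

```{r}
#| results: false

# Same slice as above, but ties are broken by keeping the first row
flights |>
  group_by(dest) |>
  slice_max(arr_delay, n = 1, with_ties = FALSE) |>
  relocate(dest)
```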
+

This is similar to computing the max delay with `summarize()`, but you get the whole corresponding row (or rows if there's a tie) instead of the single summary statistic.

### Grouping by multiple variables

You can create groups using more than one variable.
For example, we could make a group for each date.

```{r}
daily <- flights |>
  group_by(year, month, day)
daily
```

When you summarize a tibble grouped by more than one variable, each summary peels off the last group.
In hindsight, this wasn't a great way to make this function work, but it's difficult to change without breaking existing code.
To make it obvious what's happening, dplyr displays a message that tells you how you can change this behavior:

```{r}
daily_flights <- daily |>
  summarize(n = n())
```

If you're happy with this behavior, you can explicitly request it in order to suppress the message:

```{r}
#| results: false

daily_flights <- daily |>
  summarize(
    n = n(),
    .groups = "drop_last"
  )
```

Alternatively, change the default behavior by setting a different value, e.g., `"drop"` to drop all grouping or `"keep"` to preserve the same groups.

### Ungrouping

You might also want to remove grouping from a data frame without using `summarize()`.
You can do this with `ungroup()`.

```{r}
daily |>
  ungroup()
```

Now let's see what happens when you summarize an ungrouped data frame.

```{r}
daily |>
  ungroup() |>
  summarize(
    avg_delay = mean(dep_delay, na.rm = TRUE),
    flights = n()
  )
```

You get a single row back because dplyr treats all the rows in an ungrouped data frame as belonging to one group.

### `.by`

dplyr 1.1.0 includes a new, experimental syntax for per-operation grouping, the `.by` argument.
`group_by()` and `ungroup()` aren't going away, but you can now also use the `.by` argument to group within a single operation:

```{r}
#| results: false
flights |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n(),
    .by = month
  )
```

Or if you want to group by multiple variables:

```{r}
#| results: false
flights |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n(),
    .by = c(origin, dest)
  )
```

`.by` works with all verbs and has the advantage that you don't need to use the `.groups` argument to suppress the grouping message or `ungroup()` when you're done.

We didn't focus on this syntax in this chapter because it was very new when we wrote the book.
We did want to mention it because we think it has a lot of promise and it's likely to be quite popular.
You can learn more about it in the [dplyr 1.1.0 blog post](https://www.tidyverse.org/blog/2023/02/dplyr-1-1-0-per-operation-grouping/).

### Exercises

1. Which carrier has the worst average delays?
   Challenge: can you disentangle the effects of bad airports vs. bad carriers?
   Why/why not?
   (Hint: think about `flights |> group_by(carrier, dest) |> summarize(n())`)

2. Find the flights that are most delayed upon departure from each destination.

3. How do delays vary over the course of the day?
   Illustrate your answer with a plot.

4. What happens if you supply a negative `n` to `slice_min()` and friends?

5. Explain what `count()` does in terms of the dplyr verbs you just learned.
   What does the `sort` argument to `count()` do?

6. Suppose we have the following tiny data frame:

   ```{r}
   df <- tibble(
     x = 1:5,
     y = c("a", "b", "a", "a", "b"),
     z = c("K", "K", "L", "L", "K")
   )
   ```

   a.
Write down what you think the output will look like, then check if you were correct, and describe what `group_by()` does.

      ```{r}
      #| eval: false

      df |>
        group_by(y)
      ```

   b. Write down what you think the output will look like, then check if you were correct, and describe what `arrange()` does.
      Also comment on how it's different from the `group_by()` in part (a).

      ```{r}
      #| eval: false

      df |>
        arrange(y)
      ```

   c. Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does.

      ```{r}
      #| eval: false

      df |>
        group_by(y) |>
        summarize(mean_x = mean(x))
      ```

   d. Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does.
      Then, comment on what the message says.

      ```{r}
      #| eval: false

      df |>
        group_by(y, z) |>
        summarize(mean_x = mean(x))
      ```

   e. Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does.
      How is the output different from the one in part (d)?

      ```{r}
      #| eval: false

      df |>
        group_by(y, z) |>
        summarize(mean_x = mean(x), .groups = "drop")
      ```

   f. Write down what you think the outputs will look like, then check if you were correct, and describe what each pipeline does.
      How are the outputs of the two pipelines different?

      ```{r}
      #| eval: false

      df |>
        group_by(y, z) |>
        summarize(mean_x = mean(x))

      df |>
        group_by(y, z) |>
        mutate(mean_x = mean(x))
      ```

## Case study: aggregates and sample size {#sec-sample-size}

Whenever you do any aggregation, it's always a good idea to include a count (`n()`).
That way, you can ensure that you're not drawing conclusions based on very small amounts of data.
We'll demonstrate this with some baseball data from the **Lahman** package.
Specifically, we will compare what proportion of times a player gets a hit (`H`) vs. the number of times they try to put the ball in play (`AB`):

```{r}
batters <- Lahman::Batting |>
  group_by(playerID) |>
  summarize(
    performance = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    n = sum(AB, na.rm = TRUE)
  )
batters
```

When we plot the skill of the batter (measured by the batting average, `performance`) against the number of opportunities to hit the ball (measured by times at bat, `n`), you see two patterns:

1. The variation in `performance` is larger among players with fewer at-bats.
   The shape of this plot is very characteristic: whenever you plot a mean (or other summary statistics) vs. group size, you'll see that the variation decreases as the sample size increases[^data-transform-4].

2. There's a positive correlation between skill (`performance`) and opportunities to hit the ball (`n`) because teams want to give their best batters the most opportunities to hit the ball.

[^data-transform-4]: \*cough\* the law of large numbers \*cough\*.

```{r}
#| warning: false
#| fig-alt: |
#|   A scatterplot of batting performance vs. batting opportunities
#|   overlaid with a smoothed line. Average performance increases sharply
#|   from 0.2 when n is 1 to 0.25 when n is ~1000. Average performance
#|   continues to increase linearly at a much shallower slope, reaching
#|   ~0.3 when n is ~15,000.

batters |>
  filter(n > 100) |>
  ggplot(aes(x = n, y = performance)) +
  geom_point(alpha = 1 / 10) +
  geom_smooth(se = FALSE)
```

Note the handy pattern for combining ggplot2 and dplyr.
+
You just have to remember to switch from `|>`, for dataset processing, to `+` for adding layers to your plot.

This also has important implications for ranking.
If you naively sort on `desc(performance)`, the people with the best batting averages are clearly the ones who tried to put the ball in play very few times and happened to get a hit; they're not necessarily the most skilled players:

```{r}
batters |>
  arrange(desc(performance))
```

You can find a good explanation of this problem and how to overcome it at <http://varianceexplained.org/r/empirical_bayes_baseball/> and <https://www.evanmiller.org/how-not-to-sort-by-average-rating.html>.

## Summary

In this chapter, you've learned the tools that dplyr provides for working with data frames.
The tools are roughly grouped into three categories: those that manipulate the rows (like `filter()` and `arrange()`), those that manipulate the columns (like `select()` and `mutate()`), and those that manipulate groups (like `group_by()` and `summarize()`).
In this chapter, we've focused on these "whole data frame" tools, but you haven't yet learned much about what you can do with individual variables.
We'll come back to that in the Transform part of the book, where each chapter will give you tools for a specific type of variable.

In the next chapter, we'll pivot back to workflow to discuss the importance of code style, keeping your code well organized in order to make it easy for you and others to read and understand your code.
diff --git a/data-visualize.qmd b/data-visualize.qmd new file mode 100644 index 000000000..967e63544 --- /dev/null +++ b/data-visualize.qmd @@ -0,0 +1,930 @@
+# Data visualization {#sec-data-visualization}

```{r}
#| echo: false

source("_common.R")
```

## Introduction

> "The simple graph has brought more information to the data analyst's mind than any other device." --- John Tukey

R has several systems for making graphs, but ggplot2 is one of the most elegant and most versatile.
ggplot2 implements the **grammar of graphics**, a coherent system for describing and building graphs.
With ggplot2, you can do more and faster by learning one system and applying it in many places.

This chapter will teach you how to visualize your data using **ggplot2**.
We will start by creating a simple scatterplot and use that to introduce aesthetic mappings and geometric objects -- the fundamental building blocks of ggplot2.
We will then walk you through visualizing distributions of single variables as well as visualizing relationships between two or more variables.
We'll finish off with saving your plots and troubleshooting tips.

### Prerequisites

This chapter focuses on ggplot2, one of the core packages in the tidyverse.
To access the datasets, help pages, and functions used in this chapter, load the tidyverse by running:

```{r}
#| label: setup

library(tidyverse)
```

That one line of code loads the core tidyverse; the packages that you will use in almost every data analysis.
It also tells you which functions from the tidyverse conflict with functions in base R (or from other packages you might have loaded)[^data-visualize-1].

[^data-visualize-1]: You can eliminate that message and force conflict resolution to happen on demand by using the conflicted package, which becomes more important as you load more packages.
    You can learn more about conflicted at <https://conflicted.r-lib.org>.

If you run this code and get the error message `there is no package called 'tidyverse'`, you'll need to first install it, then run `library()` once again.
+

```{r}
#| eval: false

install.packages("tidyverse")
library(tidyverse)
```

You only need to install a package once, but you need to load it every time you start a new session.

In addition to tidyverse, we will also use the **palmerpenguins** package, which includes the `penguins` dataset containing body measurements for penguins on three islands in the Palmer Archipelago, and the ggthemes package, which offers a colorblind safe color palette.

```{r}
library(palmerpenguins)
library(ggthemes)
```

## First steps

Do penguins with longer flippers weigh more or less than penguins with shorter flippers?
You probably already have an answer, but try to make your answer precise.
What does the relationship between flipper length and body mass look like?
Is it positive?
Negative?
Linear?
Nonlinear?
Does the relationship vary by the species of the penguin?
How about by the island where the penguin lives?
Let's create visualizations that we can use to answer these questions.

### The `penguins` data frame

You can test your answers to those questions with the `penguins` **data frame** found in palmerpenguins (a.k.a. `palmerpenguins::penguins`).
A data frame is a rectangular collection of variables (in the columns) and observations (in the rows).
`penguins` contains `r nrow(penguins)` observations collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER[^data-visualize-2].

[^data-visualize-2]: Horst AM, Hill AP, Gorman KB (2020).
    palmerpenguins: Palmer Archipelago (Antarctica) penguin data.
    R package version 0.1.0.
    <https://allisonhorst.github.io/palmerpenguins/>.
    doi: 10.5281/zenodo.3960218.

To make the discussion easier, let's define some terms:

- A **variable** is a quantity, quality, or property that you can measure.

- A **value** is the state of a variable when you measure it.
  The value of a variable may change from measurement to measurement.

- An **observation** is a set of measurements made under similar conditions (you usually make all of the measurements in an observation at the same time and on the same object).
  An observation will contain several values, each associated with a different variable.
  We'll sometimes refer to an observation as a data point.

- **Tabular data** is a set of values, each associated with a variable and an observation.
  Tabular data is *tidy* if each value is placed in its own "cell", each variable in its own column, and each observation in its own row.

In this context, a variable refers to an attribute of all the penguins, and an observation refers to all the attributes of a single penguin.

Type the name of the data frame in the console and R will print a preview of its contents.
Note that it says `tibble` on top of this preview.
In the tidyverse, we use special data frames called **tibbles** that you will learn more about soon.

```{r}
penguins
```

This data frame contains `r ncol(penguins)` columns.
For an alternative view, where you can see all variables and the first few observations of each variable, use `glimpse()`.
Or, if you're in RStudio, run `View(penguins)` to open an interactive data viewer.

```{r}
glimpse(penguins)
```

Among the variables in `penguins` are:

1. `species`: a penguin's species (Adelie, Chinstrap, or Gentoo).

2. `flipper_length_mm`: length of a penguin's flipper, in millimeters.

3. `body_mass_g`: body mass of a penguin, in grams.

To learn more about `penguins`, open its help page by running `?penguins`.
+ +### Ultimate goal {#sec-ultimate-goal} + +Our ultimate goal in this chapter is to recreate the following visualization displaying the relationship between flipper lengths and body masses of these penguins, taking into consideration the species of the penguin. + +```{r} +#| echo: false +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins, with a +#| best fit line of the relationship between these two variables +#| overlaid. The plot displays a positive, fairly linear, and relatively +#| strong relationship between these two variables. Species (Adelie, +#| Chinstrap, and Gentoo) are represented with different colors and +#| shapes. The relationship between body mass and flipper length is +#| roughly the same for these three species, and Gentoo penguins are +#| larger than penguins from the other two species. + +ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) + + geom_point(aes(color = species, shape = species)) + + geom_smooth(method = "lm") + + labs( + title = "Body mass and flipper length", + subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", + x = "Flipper length (mm)", + y = "Body mass (g)", + color = "Species", + shape = "Species" + ) + + scale_color_colorblind() +``` + +### Creating a ggplot + +Let's recreate this plot step-by-step. + +With ggplot2, you begin a plot with the function `ggplot()`, defining a plot object that you then add **layers** to. +The first argument of `ggplot()` is the dataset to use in the graph and so `ggplot(data = penguins)` creates an empty graph that is primed to display the `penguins` data, but since we haven't told it how to visualize it yet, for now it's empty. +This is not a very exciting plot, but you can think of it like an empty canvas you'll paint the remaining layers of your plot onto. + +```{r} +#| fig-alt: | +#| A blank, gray plot area. + +ggplot(data = penguins) +``` + +Next, we need to tell `ggplot()` how the information from our data will be visually represented. +The `mapping` argument of the `ggplot()` function defines how variables in your dataset are mapped to visual properties (**aesthetics**) of your plot. +The `mapping` argument is always defined in the `aes()` function, and the `x` and `y` arguments of `aes()` specify which variables to map to the x and y axes. +For now, we will only map flipper length to the `x` aesthetic and body mass to the `y` aesthetic. +ggplot2 looks for the mapped variables in the `data` argument, in this case, `penguins`. + +The following plot shows the result of adding these mappings. + +```{r} +#| fig-alt: | +#| The plot shows flipper length on the x-axis, with values that range from +#| 170 to 230, and body mass on the y-axis, with values that range from 3000 +#| to 6000. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g) +) +``` + +Our empty canvas now has more structure -- it's clear where flipper lengths will be displayed (on the x-axis) and where body masses will be displayed (on the y-axis). +But the penguins themselves are not yet on the plot. +This is because we have not yet articulated, in our code, how to represent the observations from our data frame on our plot. + +To do so, we need to define a **geom**: the geometrical object that a plot uses to represent data. +These geometric objects are made available in ggplot2 with functions that start with `geom_`. +People often describe plots by the type of geom that the plot uses. 
+For example, bar charts use bar geoms (`geom_bar()`), line charts use line geoms (`geom_line()`), boxplots use boxplot geoms (`geom_boxplot()`), scatterplots use point geoms (`geom_point()`), and so on. + +The function `geom_point()` adds a layer of points to your plot, which creates a scatterplot. +ggplot2 comes with many geom functions that each adds a different type of layer to a plot. +You'll learn a whole bunch of geoms throughout the book, particularly in @sec-layers. + +```{r} +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins. The plot +#| displays a positive, linear, and relatively strong relationship between +#| these two variables. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g) +) + + geom_point() +``` + +Now we have something that looks like what we might think of as a "scatterplot". +It doesn't yet match our "ultimate goal" plot, but using this plot we can start answering the question that motivated our exploration: "What does the relationship between flipper length and body mass look like?" The relationship appears to be positive (as flipper length increases, so does body mass), fairly linear (the points are clustered around a line instead of a curve), and moderately strong (there isn't too much scatter around such a line). +Penguins with longer flippers are generally larger in terms of their body mass. + +Before we add more layers to this plot, let's pause for a moment and review the warning message we got: + +> Removed 2 rows containing missing values (`geom_point()`). + +We're seeing this message because there are two penguins in our dataset with missing body mass and/or flipper length values and ggplot2 has no way of representing them on the plot without both of these values. +Like R, ggplot2 subscribes to the philosophy that missing values should never silently go missing. +This type of warning is probably one of the most common types of warnings you will see when working with real data -- missing values are a very common issue and you'll learn more about them throughout the book, particularly in @sec-missing-values. +For the remaining plots in this chapter we will suppress this warning so it's not printed alongside every single plot we make. + +### Adding aesthetics and layers {#sec-adding-aesthetics-layers} + +Scatterplots are useful for displaying the relationship between two numerical variables, but it's always a good idea to be skeptical of any apparent relationship between two variables and ask if there may be other variables that explain or change the nature of this apparent relationship. +For example, does the relationship between flipper length and body mass differ by species? +Let's incorporate species into our plot and see if this reveals any additional insights into the apparent relationship between these variables. +We will do this by representing species with different colored points. + +To achieve this, will we need to modify the aesthetic or the geom? +If you guessed "in the aesthetic mapping, inside of `aes()`", you're already getting the hang of creating data visualizations with ggplot2! +And if not, don't worry. +Throughout the book you will make many more ggplots and have many more opportunities to check your intuition as you make them. + +```{r} +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins. The plot +#| displays a positive, fairly linear, and relatively strong relationship +#| between these two variables. 
Species (Adelie, Chinstrap, and Gentoo) +#| are represented with different colors. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species) +) + + geom_point() +``` + +When a categorical variable is mapped to an aesthetic, ggplot2 will automatically assign a unique value of the aesthetic (here a unique color) to each unique level of the variable (each of the three species), a process known as **scaling**. +ggplot2 will also add a legend that explains which values correspond to which levels. + +Now let's add one more layer: a smooth curve displaying the relationship between body mass and flipper length. +Before you proceed, refer back to the code above, and think about how we can add this to our existing plot. + +Since this is a new geometric object representing our data, we will add a new geom as a layer on top of our point geom: `geom_smooth()`. +And we will specify that we want to draw the line of best fit based on a `l`inear `m`odel with `method = "lm"`. + +```{r} +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins. Overlaid +#| on the scatterplot are three smooth curves displaying the +#| relationship between these variables for each species (Adelie, +#| Chinstrap, and Gentoo). Different penguin species are plotted in +#| different colors for the points and the smooth curves. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species) +) + + geom_point() + + geom_smooth(method = "lm") +``` + +We have successfully added lines, but this plot doesn't look like the plot from @sec-ultimate-goal, which only has one line for the entire dataset as opposed to separate lines for each of the penguin species. + +When aesthetic mappings are defined in `ggplot()`, at the *global* level, they're passed down to each of the subsequent geom layers of the plot. +However, each geom function in ggplot2 can also take a `mapping` argument, which allows for aesthetic mappings at the *local* level that are added to those inherited from the global level. +Since we want points to be colored based on species but don't want the lines to be separated out for them, we should specify `color = species` for `geom_point()` only. + +```{r} +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins. Overlaid +#| on the scatterplot is a single line of best fit displaying the +#| relationship between these variables for each species (Adelie, +#| Chinstrap, and Gentoo). Different penguin species are plotted in +#| different colors for the points only. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g) +) + + geom_point(mapping = aes(color = species)) + + geom_smooth(method = "lm") +``` + +Voila! +We have something that looks very much like our ultimate goal, though it's not yet perfect. +We still need to use different shapes for each species of penguins and improve labels. + +It's generally not a good idea to represent information using only colors on a plot, as people perceive colors differently due to color blindness or other color vision differences. +Therefore, in addition to color, we can also map `species` to the `shape` aesthetic. + +```{r} +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins. Overlaid +#| on the scatterplot is a single line of best fit displaying the +#| relationship between these variables for each species (Adelie, +#| Chinstrap, and Gentoo). 
Different penguin species are plotted in +#| different colors and shapes for the points only. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g) +) + + geom_point(mapping = aes(color = species, shape = species)) + + geom_smooth(method = "lm") +``` + +Note that the legend is automatically updated to reflect the different shapes of the points as well. + +And finally, we can improve the labels of our plot using the `labs()` function in a new layer. +Some of the arguments to `labs()` might be self explanatory: `title` adds a title and `subtitle` adds a subtitle to the plot. +Other arguments match the aesthetic mappings, `x` is the x-axis label, `y` is the y-axis label, and `color` and `shape` define the label for the legend. +In addition, we can improve the color palette to be colorblind safe with the `scale_color_colorblind()` function from the ggthemes package. + +```{r} +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins, with a +#| line of best fit displaying the relationship between these two variables +#| overlaid. The plot displays a positive, fairly linear, and relatively +#| strong relationship between these two variables. Species (Adelie, +#| Chinstrap, and Gentoo) are represented with different colors and +#| shapes. The relationship between body mass and flipper length is +#| roughly the same for these three species, and Gentoo penguins are +#| larger than penguins from the other two species. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g) +) + + geom_point(aes(color = species, shape = species)) + + geom_smooth(method = "lm") + + labs( + title = "Body mass and flipper length", + subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", + x = "Flipper length (mm)", y = "Body mass (g)", + color = "Species", shape = "Species" + ) + + scale_color_colorblind() +``` + +We finally have a plot that perfectly matches our "ultimate goal"! + +### Exercises + +1. How many rows are in `penguins`? + How many columns? + +2. What does the `bill_depth_mm` variable in the `penguins` data frame describe? + Read the help for `?penguins` to find out. + +3. Make a scatterplot of `bill_depth_mm` vs. `bill_length_mm`. + That is, make a scatterplot with `bill_depth_mm` on the y-axis and `bill_length_mm` on the x-axis. + Describe the relationship between these two variables. + +4. What happens if you make a scatterplot of `species` vs. `bill_depth_mm`? + What might be a better choice of geom? + +5. Why does the following give an error and how would you fix it? + + ```{r} + #| eval: false + + ggplot(data = penguins) + + geom_point() + ``` + +6. What does the `na.rm` argument do in `geom_point()`? + What is the default value of the argument? + Create a scatterplot where you successfully use this argument set to `TRUE`. + +7. Add the following caption to the plot you made in the previous exercise: "Data come from the palmerpenguins package." Hint: Take a look at the documentation for `labs()`. + +8. Recreate the following visualization. + What aesthetic should `bill_depth_mm` be mapped to? + And should it be mapped at the global level or at the geom level? + + ```{r} + #| echo: false + #| warning: false + #| fig-alt: | + #| A scatterplot of body mass vs. flipper length of penguins, colored + #| by bill depth. A smooth curve of the relationship between body mass + #| and flipper length is overlaid. The relationship is positive, + #| fairly linear, and moderately strong. 
+

    ggplot(
      data = penguins,
      mapping = aes(x = flipper_length_mm, y = body_mass_g)
    ) +
      geom_point(aes(color = bill_depth_mm)) +
      geom_smooth()
    ```

9. Run this code in your head and predict what the output will look like.
   Then, run the code in R and check your predictions.

   ```{r}
   #| eval: false

   ggplot(
     data = penguins,
     mapping = aes(x = flipper_length_mm, y = body_mass_g, color = island)
   ) +
     geom_point() +
     geom_smooth(se = FALSE)
   ```

10. Will these two graphs look different?
    Why/why not?

    ```{r}
    #| eval: false

    ggplot(
      data = penguins,
      mapping = aes(x = flipper_length_mm, y = body_mass_g)
    ) +
      geom_point() +
      geom_smooth()

    ggplot() +
      geom_point(
        data = penguins,
        mapping = aes(x = flipper_length_mm, y = body_mass_g)
      ) +
      geom_smooth(
        data = penguins,
        mapping = aes(x = flipper_length_mm, y = body_mass_g)
      )
    ```

## ggplot2 calls {#sec-ggplot2-calls}

As we move on from these introductory sections, we'll transition to a more concise expression of ggplot2 code.
So far we've been very explicit, which is helpful when you are learning:

```{r}
#| eval: false

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point()
```

Typically, the first one or two arguments to a function are so important that you should know them by heart.
The first two arguments to `ggplot()` are `data` and `mapping`; in the remainder of the book, we won't supply those names.
That saves typing, and, by reducing the amount of extra text, makes it easier to see what's different between plots.
That's a really important programming concern that we'll come back to in @sec-functions.

Rewriting the previous plot more concisely yields:

```{r}
#| eval: false

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
```

In the future, you'll also learn about the pipe, `|>`, which will allow you to create that plot with:

```{r}
#| eval: false

penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
```

## Visualizing distributions

How you visualize the distribution of a variable depends on the type of variable: categorical or numerical.

### A categorical variable

A variable is **categorical** if it can only take one of a small set of values.
To examine the distribution of a categorical variable, you can use a bar chart.
The height of the bars displays how many observations occurred with each `x` value.

```{r}
#| fig-alt: |
#|   A bar chart of frequencies of species of penguins: Adelie
#|   (approximately 150), Chinstrap (approximately 90), Gentoo
#|   (approximately 125).

ggplot(penguins, aes(x = species)) +
  geom_bar()
```

In bar plots of categorical variables with non-ordered levels, like the penguin `species` above, it's often preferable to reorder the bars based on their frequencies.
Doing so requires transforming the variable to a factor (how R handles categorical data) and then reordering the levels of that factor.

```{r}
#| fig-alt: |
#|   A bar chart of frequencies of species of penguins, where the bars are
#|   ordered in decreasing order of their heights (frequencies): Adelie
#|   (approximately 150), Gentoo (approximately 125), Chinstrap
#|   (approximately 90).

ggplot(penguins, aes(x = fct_infreq(species))) +
  geom_bar()
```

You will learn more about factors and functions for dealing with factors (like `fct_infreq()` shown above) in @sec-factors.
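
If you'd rather show the bars in increasing order instead, one option is to reverse the frequency-ordered levels with `fct_rev()`, another forcats function that is loaded with the core tidyverse; a quick sketch:

```{r}
#| eval: false

# Reverse the frequency ordering so bars increase from left to right
ggplot(penguins, aes(x = fct_rev(fct_infreq(species)))) +
  geom_bar()
```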
+

### A numerical variable

A variable is **numerical** (or quantitative) if it can take on a wide range of numerical values, and it is sensible to add, subtract, or take averages with those values.
Numerical variables can be continuous or discrete.

One commonly used visualization for distributions of continuous variables is a histogram.

```{r}
#| warning: false
#| fig-alt: |
#|   A histogram of body masses of penguins. The distribution is unimodal
#|   and right skewed, ranging between approximately 2500 and 6500 grams.

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 200)
```

A histogram divides the x-axis into equally spaced bins and then uses the height of a bar to display the number of observations that fall in each bin.
In the graph above, the tallest bar shows that 39 observations have a `body_mass_g` value between 3,500 and 3,700 grams, which are the left and right edges of the bar.

You can set the width of the intervals in a histogram with the `binwidth` argument, which is measured in the units of the `x` variable.
You should always explore a variety of binwidths when working with histograms, as different binwidths can reveal different patterns.
In the plots below a binwidth of 20 is too narrow, resulting in too many bars, making it difficult to determine the shape of the distribution.
Similarly, a binwidth of 2,000 is too wide, resulting in all data being binned into only three bars, and also making it difficult to determine the shape of the distribution.
A binwidth of 200 provides a sensible balance.

```{r}
#| warning: false
#| layout-ncol: 2
#| fig-width: 3
#| fig-alt: |
#|   Two histograms of body masses of penguins, one with binwidth of 20
#|   (left) and one with binwidth of 2000 (right). The histogram with binwidth
#|   of 20 shows lots of ups and downs in the heights of the bins, creating a
#|   jagged outline. The histogram with binwidth of 2000 shows only three bins.

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 20)
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 2000)
```

An alternative visualization for distributions of numerical variables is a density plot.
A density plot is a smoothed-out version of a histogram and a practical alternative, particularly for continuous data that comes from an underlying smooth distribution.
We won't go into how `geom_density()` estimates the density (you can read more about that in the function documentation), but let's explain how the density curve is drawn with an analogy.
Imagine a histogram made out of wooden blocks.
Then, imagine that you drop a cooked spaghetti string over it.
The shape the spaghetti will take draped over the blocks can be thought of as the shape of the density curve.
It shows fewer details than a histogram but can make it easier to quickly glean the shape of the distribution, particularly with respect to modes and skewness.

```{r}
#| fig-alt: |
#|   A density plot of body masses of penguins. The distribution is unimodal
#|   and right skewed, ranging between approximately 2500 and 6500 grams.

ggplot(penguins, aes(x = body_mass_g)) +
  geom_density()
```

### Exercises

1. Make a bar plot of `species` of `penguins`, where you assign `species` to the `y` aesthetic.
   How is this plot different?

2. How are the following two plots different?
   Which aesthetic, `color` or `fill`, is more useful for changing the color of bars?
+ + ```{r} + #| eval: false + + ggplot(penguins, aes(x = species)) + + geom_bar(color = "red") + + ggplot(penguins, aes(x = species)) + + geom_bar(fill = "red") + ``` + +3. What does the `bins` argument in `geom_histogram()` do? + +4. Make a histogram of the `carat` variable in the `diamonds` dataset that is available when you load the tidyverse package. + Experiment with different binwidths. + What binwidth reveals the most interesting patterns? + +## Visualizing relationships + +To visualize a relationship we need to have at least two variables mapped to aesthetics of a plot. +In the following sections you will learn about commonly used plots for visualizing relationships between two or more variables and the geoms used for creating them. + +### A numerical and a categorical variable + +To visualize the relationship between a numerical and a categorical variable we can use side-by-side box plots. +A **boxplot** is a type of visual shorthand for measures of position (percentiles) that describe a distribution. +It is also useful for identifying potential outliers. +As shown in @fig-eda-boxplot, each boxplot consists of: + +- A box that indicates the range of the middle half of the data, a distance known as the interquartile range (IQR), stretching from the 25th percentile of the distribution to the 75th percentile. + In the middle of the box is a line that displays the median, i.e. 50th percentile, of the distribution. + These three lines give you a sense of the spread of the distribution and whether or not the distribution is symmetric about the median or skewed to one side. + +- Visual points that display observations that fall more than 1.5 times the IQR from either edge of the box. + These outlying points are unusual so are plotted individually. + +- A line (or whisker) that extends from each end of the box and goes to the farthest non-outlier point in the distribution. + +```{r} +#| label: fig-eda-boxplot +#| echo: false +#| fig-cap: | +#| Diagram depicting how a boxplot is created. +#| fig-alt: | +#| A diagram depicting how a boxplot is created following the steps outlined +#| above. + +knitr::include_graphics("images/EDA-boxplot.png") +``` + +Let's take a look at the distribution of body mass by species using `geom_boxplot()`: + +```{r} +#| warning: false +#| fig-alt: | +#| Side-by-side box plots of distributions of body masses of Adelie, +#| Chinstrap, and Gentoo penguins. The distribution of Adelie and +#| Chinstrap penguins' body masses appear to be symmetric with +#| medians around 3750 grams. The median body mass of Gentoo penguins +#| is much higher, around 5000 grams, and the distribution of the +#| body masses of these penguins appears to be somewhat right skewed. + +ggplot(penguins, aes(x = species, y = body_mass_g)) + + geom_boxplot() +``` + +Alternatively, we can make density plots with `geom_density()`. + +```{r} +#| warning: false +#| fig-alt: | +#| A density plot of body masses of penguins by species of penguins. Each +#| species (Adelie, Chinstrap, and Gentoo) is represented with different +#| colored outlines for the density curves. + +ggplot(penguins, aes(x = body_mass_g, color = species)) + + geom_density(linewidth = 0.75) +``` + +We've also customized the thickness of the lines using the `linewidth` argument in order to make them stand out a bit more against the background. + +Additionally, we can map `species` to both `color` and `fill` aesthetics and use the `alpha` aesthetic to add transparency to the filled density curves. 
+This aesthetic takes values between 0 (completely transparent) and 1 (completely opaque).
In the following plot it's *set* to 0.5.

```{r}
#| warning: false
#| fig-alt: |
#|   A density plot of body masses of penguins by species of penguins. Each
#|   species (Adelie, Chinstrap, and Gentoo) is represented in different
#|   colored outlines for the density curves. The density curves are also
#|   filled with the same colors, with some transparency added.

ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +
  geom_density(alpha = 0.5)
```

Note the terminology we have used here:

- We *map* variables to aesthetics if we want the visual attribute represented by that aesthetic to vary based on the values of that variable.
- Otherwise, we *set* the value of an aesthetic.

### Two categorical variables

We can use stacked bar plots to visualize the relationship between two categorical variables.
For example, the following two stacked bar plots both display the relationship between `island` and `species`, or specifically, visualizing the distribution of `species` within each island.

The first plot shows the frequencies of each species of penguins on each island.
The plot of frequencies shows that there are equal numbers of Adelies on each island.
But we don't have a good sense of the percentage balance within each island.

```{r}
#| fig-alt: |
#|   Bar plots of penguin species by island (Biscoe, Dream, and Torgersen)
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar()
```

The second plot, a relative frequency plot created by setting `position = "fill"` in the geom, is more useful for comparing species distributions across islands since it's not affected by the unequal numbers of penguins across the islands.
Using this plot we can see that Gentoo penguins all live on Biscoe island and make up roughly 75% of the penguins on that island, Chinstrap all live on Dream island and make up roughly 50% of the penguins on that island, and Adelie live on all three islands and make up all of the penguins on Torgersen.

```{r}
#| fig-alt: |
#|   Bar plots of penguin species by island (Biscoe, Dream, and Torgersen),
#|   where the bars are scaled to the same height, making it a relative
#|   frequencies plot.

ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill")
```

In creating these bar charts, we map the variable that will be separated into bars to the `x` aesthetic, and the variable that will change the colors inside the bars to the `fill` aesthetic.

### Two numerical variables

So far you've learned about scatterplots (created with `geom_point()`) and smooth curves (created with `geom_smooth()`) for visualizing the relationship between two numerical variables.
A scatterplot is probably the most commonly used plot for visualizing the relationship between two numerical variables.

```{r}
#| warning: false
#| fig-alt: |
#|   A scatterplot of body mass vs. flipper length of penguins. The plot
#|   displays a positive, linear, relatively strong relationship between
#|   these two variables.

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
```

### Three or more variables

As we saw in @sec-adding-aesthetics-layers, we can incorporate more variables into a plot by mapping them to additional aesthetics.
For example, in the following scatterplot the colors of points represent species and the shapes of points represent islands.
+

```{r}
#| warning: false
#| fig-alt: |
#|   A scatterplot of body mass vs. flipper length of penguins. The plot
#|   displays a positive, linear, relatively strong relationship between
#|   these two variables. The points are colored based on the species of the
#|   penguins and the shapes of the points represent islands (round points are
#|   Biscoe island, triangles are Dream island, and squares are Torgersen
#|   island). The plot is very busy and it's difficult to distinguish the
#|   shapes of the points.

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = island))
```

However, adding too many aesthetic mappings to a plot makes it cluttered and difficult to make sense of.
Another way, which is particularly useful for categorical variables, is to split your plot into **facets**, subplots that each display one subset of the data.

To facet your plot by a single variable, use `facet_wrap()`.
The first argument of `facet_wrap()` is a formula[^data-visualize-3], which you create with `~` followed by a variable name.
The variable that you pass to `facet_wrap()` should be categorical.

[^data-visualize-3]: Here "formula" is the name of the thing created by `~`, not a synonym for "equation".

```{r}
#| warning: false
#| fig-width: 8
#| fig-asp: 0.33
#| fig-alt: |
#|   A scatterplot of body mass vs. flipper length of penguins. The shapes and
#|   colors of points represent species. Penguins from each island are on a
#|   separate facet. Within each facet, the relationship between body mass and
#|   flipper length is positive, linear, relatively strong.

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) +
  facet_wrap(~island)
```

You will learn about many other geoms for visualizing distributions of variables and relationships between them in @sec-layers.

### Exercises

1. The `mpg` data frame that is bundled with the ggplot2 package contains `r nrow(mpg)` observations collected by the US Environmental Protection Agency on `r mpg |> distinct(model) |> nrow()` car models.
   Which variables in `mpg` are categorical?
   Which variables are numerical?
   (Hint: Type `?mpg` to read the documentation for the dataset.) How can you see this information when you run `mpg`?

2. Make a scatterplot of `hwy` vs. `displ` using the `mpg` data frame.
   Next, map a third, numerical variable to `color`, then `size`, then both `color` and `size`, then `shape`.
   How do these aesthetics behave differently for categorical vs. numerical variables?

3. In the scatterplot of `hwy` vs. `displ`, what happens if you map a third variable to `linewidth`?

4. What happens if you map the same variable to multiple aesthetics?

5. Make a scatterplot of `bill_depth_mm` vs. `bill_length_mm` and color the points by `species`.
   What does adding coloring by species reveal about the relationship between these two variables?
   What about faceting by `species`?

6. Why does the following yield two separate legends?
   How would you fix it to combine the two legends?

   ```{r}
   #| warning: false
   #| fig-show: hide

   ggplot(
     data = penguins,
     mapping = aes(
       x = bill_length_mm, y = bill_depth_mm,
       color = species, shape = species
     )
   ) +
     geom_point() +
     labs(color = "Species")
   ```

7. Create the two following stacked bar plots.
   Which question can you answer with the first one?
   Which question can you answer with the second one?
+
+    ```{r}
+    #| fig-show: hide
+
+    ggplot(penguins, aes(x = island, fill = species)) +
+      geom_bar(position = "fill")
+    ggplot(penguins, aes(x = species, fill = island)) +
+      geom_bar(position = "fill")
+    ```
+
+## Saving your plots {#sec-ggsave}
+
+Once you've made a plot, you might want to get it out of R by saving it as an image that you can use elsewhere.
+That's the job of `ggsave()`, which will save the plot most recently created to disk:
+
+```{r}
+#| fig-show: hide
+#| warning: false
+
+ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
+  geom_point()
+ggsave(filename = "penguin-plot.png")
+```
+
+```{r}
+#| include: false
+
+file.remove("penguin-plot.png")
+```
+
+This will save your plot to your working directory, a concept you'll learn more about in @sec-workflow-scripts-projects.
+
+If you don't specify the `width` and `height`, they will be taken from the dimensions of the current plotting device.
+For reproducible code, you'll want to specify them.
+You can learn more about `ggsave()` in the documentation.
+
+Generally, however, we recommend that you assemble your final reports using Quarto, a reproducible authoring system that allows you to interleave your code and your prose and automatically include your plots in your write-ups.
+You will learn more about Quarto in @sec-quarto.
+
+### Exercises
+
+1. Run the following lines of code.
+    Which of the two plots is saved as `mpg-plot.png`?
+    Why?
+
+    ```{r}
+    #| eval: false
+
+    ggplot(mpg, aes(x = class)) +
+      geom_bar()
+    ggplot(mpg, aes(x = cty, y = hwy)) +
+      geom_point()
+    ggsave("mpg-plot.png")
+    ```
+
+2. What do you need to change in the code above to save the plot as a PDF instead of a PNG?
+    How could you find out what types of image files would work in `ggsave()`?
+
+## Common problems
+
+As you start to run R code, you're likely to run into problems.
+Don't worry --- it happens to everyone.
+We have all been writing R code for years, but every day we still write code that doesn't work on the first try!
+
+Start by carefully comparing the code that you're running to the code in the book.
+R is extremely picky, and a misplaced character can make all the difference.
+Make sure that every `(` is matched with a `)` and every `"` is paired with another `"`.
+Sometimes you'll run the code and nothing happens.
+Check the left-hand side of your console: if it's a `+`, it means that R doesn't think you've typed a complete expression and it's waiting for you to finish it.
+In this case, it's usually easy to start from scratch again by pressing ESCAPE to abort processing the current command.
+
+One common problem when creating ggplot2 graphics is to put the `+` in the wrong place: it has to come at the end of the line, not the start.
+In other words, make sure you haven't accidentally written code like this:
+
+```{r}
+#| eval: false
+
+ggplot(data = mpg)
++ geom_point(mapping = aes(x = displ, y = hwy))
+```
+
+If you're still stuck, try the help.
+You can get help about any R function by running `?function_name` in the console, or highlighting the function name and pressing F1 in RStudio.
+Don't worry if the help doesn't seem that helpful --- instead skip down to the examples and look for code that matches what you're trying to do.
+
+If that doesn't help, carefully read the error message.
+Sometimes the answer will be buried there!
+But when you're new to R, even if the answer is in the error message, you might not yet know how to understand it.
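+
+For example, here's a sketch of the kind of error a simple typo produces (a hypothetical session; the exact wording of the message depends on your version of R):
+
+```{r}
+#| eval: false
+
+# A typo in the geom name means R can't find the function,
+# and the error message names the missing function directly.
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_piont()
+#> Error in geom_piont() : could not find function "geom_piont"
+```
+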
+Another great tool is Google: try googling the error message, as it's likely someone else has had the same problem and has gotten help online.
+
+## Summary
+
+In this chapter, you've learned the basics of data visualization with ggplot2.
+We started with the basic idea that underpins ggplot2: a visualization is a mapping from variables in your data to aesthetic properties like position, color, size, and shape.
+You then learned about increasing the complexity and improving the presentation of your plots layer-by-layer.
+You also learned about commonly used plots for visualizing the distribution of a single variable as well as for visualizing relationships between two or more variables, by leveraging additional aesthetic mappings and/or splitting your plot into small multiples using faceting.
+
+We'll use visualizations again and again throughout this book, introducing new techniques as we need them, and we'll do a deeper dive into creating visualizations with ggplot2 in @sec-layers through @sec-communication.
+
+With the basics of visualization under your belt, in the next chapter we're going to switch gears a little and give you some practical workflow advice.
+We intersperse workflow advice with data science tools throughout this part of the book because it'll help you stay organized as you write increasing amounts of R code.
diff --git a/data/01-sales.csv b/data/01-sales.csv
new file mode 100644
index 000000000..534e8ac10
--- /dev/null
+++ b/data/01-sales.csv
@@ -0,0 +1,8 @@
+month,year,brand,item,n
+January,2019,1,1234,3
+January,2019,1,8721,9
+January,2019,1,1822,2
+January,2019,2,3333,1
+January,2019,2,2156,9
+January,2019,2,3987,6
+January,2019,2,3827,6
\ No newline at end of file
diff --git a/data/02-sales.csv b/data/02-sales.csv
new file mode 100644
index 000000000..6c8d2aa52
--- /dev/null
+++ b/data/02-sales.csv
@@ -0,0 +1,7 @@
+month,year,brand,item,n
+February,2019,1,1234,8
+February,2019,1,8721,2
+February,2019,1,1822,3
+February,2019,2,3333,1
+February,2019,2,2156,3
+February,2019,2,3987,6
diff --git a/data/03-sales.csv b/data/03-sales.csv
new file mode 100644
index 000000000..61d4d34a0
--- /dev/null
+++ b/data/03-sales.csv
@@ -0,0 +1,7 @@
+month,year,brand,item,n
+March,2019,1,1234,3
+March,2019,1,3627,1
+March,2019,1,8820,3
+March,2019,2,7253,1
+March,2019,2,8766,3
+March,2019,2,8288,6
diff --git a/data/bake-sale.xlsx b/data/bake-sale.xlsx
new file mode 100644
index 000000000..788373cc0
Binary files /dev/null and b/data/bake-sale.xlsx differ
diff --git a/data/gapminder.R b/data/gapminder.R
new file mode 100644
index 000000000..c2c297911
--- /dev/null
+++ b/data/gapminder.R
@@ -0,0 +1,24 @@
+library(tidyverse)
+
+# How many rows are there per year?
+repurrrsive::gap_simple |>
+  count(year)
+
+by_year <- repurrrsive::gap_simple |>
+  group_by(year)
+# One output path per year, e.g. data/gapminder/1952.xlsx
+paths <- by_year |>
+  group_keys() |>
+  mutate(path = str_glue("data/gapminder/{year}.xlsx")) |>
+  pull()
+paths
+
+# Split into one data frame per year, dropping the now-redundant year column
+years <- by_year |>
+  group_split() |>
+  map(\(df) select(df, -year))
+
+dir.create("data/gapminder")
+
+# Write each year's data frame to its matching .xlsx path
+walk2(years, paths, writexl::write_xlsx)
diff --git a/data/gapminder/1952.xlsx b/data/gapminder/1952.xlsx
new file mode 100644
index 000000000..7ce82a5b3
Binary files /dev/null and b/data/gapminder/1952.xlsx differ
diff --git a/data/gapminder/1957.xlsx b/data/gapminder/1957.xlsx
new file mode 100644
index 000000000..c909acdf2
Binary files /dev/null and b/data/gapminder/1957.xlsx differ
diff --git a/data/gapminder/1962.xlsx b/data/gapminder/1962.xlsx
new file mode 100644
index 000000000..621e4c682
Binary files /dev/null and b/data/gapminder/1962.xlsx differ
diff --git a/data/gapminder/1967.xlsx b/data/gapminder/1967.xlsx
new file mode 100644
index 000000000..337a45da9
Binary files /dev/null and b/data/gapminder/1967.xlsx differ
diff --git a/data/gapminder/1972.xlsx b/data/gapminder/1972.xlsx
new file mode 100644
index 000000000..21f9de80e
Binary files /dev/null and b/data/gapminder/1972.xlsx differ
diff --git a/data/gapminder/1977.xlsx b/data/gapminder/1977.xlsx
new file mode 100644
index 000000000..f71a9f501
Binary files /dev/null and b/data/gapminder/1977.xlsx differ
diff --git a/data/gapminder/1982.xlsx b/data/gapminder/1982.xlsx
new file mode 100644
index 000000000..0ff0eae80
Binary files /dev/null and b/data/gapminder/1982.xlsx differ
diff --git a/data/gapminder/1987.xlsx b/data/gapminder/1987.xlsx
new file mode 100644
index 000000000..a0b10ceb3
Binary files /dev/null and b/data/gapminder/1987.xlsx differ
diff --git a/data/gapminder/1992.xlsx b/data/gapminder/1992.xlsx
new file mode 100644
index 000000000..6ae0e5690
Binary files /dev/null and b/data/gapminder/1992.xlsx differ
diff --git a/data/gapminder/1997.xlsx b/data/gapminder/1997.xlsx
new file mode 100644
index 000000000..fe6517026
Binary files /dev/null and b/data/gapminder/1997.xlsx differ
diff --git a/data/gapminder/2002.xlsx b/data/gapminder/2002.xlsx
new file mode 100644
index 000000000..f794a287a
Binary files /dev/null and b/data/gapminder/2002.xlsx differ
diff --git a/data/gapminder/2007.xlsx b/data/gapminder/2007.xlsx
new file mode 100644
index 000000000..0601ec54c
Binary files /dev/null and b/data/gapminder/2007.xlsx differ
diff --git a/data/penguins.xlsx b/data/penguins.xlsx
new file mode 100644
index 000000000..2e198cea3
Binary files /dev/null and b/data/penguins.xlsx differ
diff --git a/data/roster.xlsx b/data/roster.xlsx
new file mode 100644
index 000000000..43943aecb
Binary files /dev/null and b/data/roster.xlsx differ
diff --git a/data/sales.xlsx b/data/sales.xlsx
new file mode 100644
index 000000000..c9d306cd4
Binary files /dev/null and b/data/sales.xlsx differ
diff --git a/data/students.csv b/data/students.csv
new file mode 100644
index 000000000..3ad908e5a
--- /dev/null
+++ b/data/students.csv
@@ -0,0 +1,7 @@
+Student ID,Full Name,favourite.food,mealPlan,AGE
+1,Sunil Huffmann,Strawberry yoghurt,Lunch only,4
+2,Barclay Lynn,French fries,Lunch only,5
+3,Jayendra Lyne,N/A,Breakfast and lunch,7
+4,Leon Rossini,Anchovies,Lunch only,
+5,Chidiegwu Dunkel,Pizza,Breakfast and lunch,five
+6,Güvenç Attila,Ice cream,Lunch only,6
\ No newline at end of file
diff --git a/data/students.xlsx b/data/students.xlsx
new file mode 100644
index 000000000..a0345c461
Binary files /dev/null and b/data/students.xlsx differ
diff --git a/data/survey.xlsx b/data/survey.xlsx
new file mode 100644
index 000000000..e4e647d0b
Binary files /dev/null and b/data/survey.xlsx differ
diff --git a/databases.qmd b/databases.qmd
new file mode 100644
index 000000000..b67287b35
--- /dev/null
+++ b/databases.qmd
@@ -0,0 +1,666 @@
+# Databases {#sec-import-databases}
+
+```{r}
+#| echo: false
+
+source("_common.R")
+```
+
+## Introduction
+
+A huge amount of data lives in databases, so it's essential that you know how to access it.
+Sometimes you can ask someone to download a snapshot into a `.csv` for you, but this gets painful quickly: every time you need to make a change you'll have to communicate with another human.
+You want to be able to reach into the database directly to get the data you need, when you need it.
+
+In this chapter, you'll first learn the basics of the DBI package: how to use it to connect to a database and then retrieve data with a SQL[^databases-1] query.
+**SQL**, short for **s**tructured **q**uery **l**anguage, is the lingua franca of databases, and is an important language for all data scientists to learn.
+That said, we're not going to start with SQL, but instead we'll teach you dbplyr, which can translate your dplyr code to SQL.
+We'll use that as a way to teach you some of the most important features of SQL.
+You won't become a SQL master by the end of the chapter, but you will be able to identify the most important components and understand what they do.
+
+[^databases-1]: SQL is either pronounced "s"-"q"-"l" or "sequel".
+
+### Prerequisites
+
+In this chapter, we'll introduce DBI and dbplyr.
+DBI is a low-level interface that connects to databases and executes SQL; dbplyr is a high-level interface that translates your dplyr code to SQL queries then executes them with DBI.
+
+```{r}
+#| label: setup
+#| message: false
+
+library(DBI)
+library(dbplyr)
+library(tidyverse)
+```
+
+## Database basics
+
+At the simplest level, you can think about a database as a collection of data frames, called **tables** in database terminology.
+Like a data frame, a database table is a collection of named columns, where every value in the column is the same type.
+There are three high-level differences between data frames and database tables:
+
+- Database tables are stored on disk and can be arbitrarily large.
+  Data frames are stored in memory, and are fundamentally limited (although that limit is still plenty large for many problems).
+
+- Database tables almost always have indexes.
+  Much like the index of a book, a database index makes it possible to quickly find rows of interest without having to look at every single row.
+  Data frames and tibbles don't have indexes, but data.tables do, which is one of the reasons that they're so fast.
+
+- Most classical databases are optimized for rapidly collecting data, not analyzing existing data.
+  These databases are called **row-oriented** because the data is stored row-by-row, rather than column-by-column like R.
+  More recently, there's been much development of **column-oriented** databases that make analyzing the existing data much faster.
+
+Databases are run by database management systems (**DBMS**'s for short), which come in three basic forms:
+
+- **Client-server** DBMS's run on a powerful central server, which you connect to from your computer (the client). They are great for sharing data with multiple people in an organization. Popular client-server DBMS's include PostgreSQL, MariaDB, SQL Server, and Oracle.
+- **Cloud** DBMS's, like Snowflake, Amazon's RedShift, and Google's BigQuery, are similar to client-server DBMS's, but they run in the cloud. This means that they can easily handle extremely large datasets and can automatically provide more compute resources as needed.
+- **In-process** DBMS's, like SQLite or duckdb, run entirely on your computer. They're great for working with large datasets where you're the primary user.
+
+## Connecting to a database
+
+To connect to the database from R, you'll use a pair of packages:
+
+- You'll always use DBI (**d**ata**b**ase **i**nterface) because it provides a set of generic functions that connect to the database, upload data, run SQL queries, etc.
+
+- You'll also use a package tailored for the DBMS you're connecting to.
+  This package translates the generic DBI commands into the specifics needed for a given DBMS.
+  There's usually one package for each DBMS, e.g.
+  RPostgres for PostgreSQL and RMariaDB for MariaDB.
+
+If you can't find a specific package for your DBMS, you can usually use the odbc package instead.
+This uses the ODBC protocol supported by many DBMS's.
+odbc requires a little more setup because you'll also need to install an ODBC driver and tell the odbc package where to find it.
+
+Concretely, you create a database connection using `DBI::dbConnect()`.
+The first argument selects the DBMS[^databases-2], then the second and subsequent arguments describe how to connect to it (i.e. where it lives and the credentials that you need to access it).
+The following code shows a couple of typical examples:
+
+[^databases-2]: Typically, this is the only function you'll use from the client package, so we recommend using `::` to pull out that one function, rather than loading the complete package with `library()`.
+
+```{r}
+#| eval: false
+con <- DBI::dbConnect(
+  RMariaDB::MariaDB(),
+  username = "foo"
+)
+con <- DBI::dbConnect(
+  RPostgres::Postgres(),
+  host = "databases.mycompany.com",
+  port = 1234
+)
+```
+
+The precise details of the connection vary a lot from DBMS to DBMS, so unfortunately we can't cover all the details here.
+This means you'll need to do a little research on your own.
+Typically you can ask the other data scientists in your team or talk to your DBA (**d**ata**b**ase **a**dministrator).
+The initial setup will often take a little fiddling (and maybe some googling) to get it right, but you'll generally only need to do it once.
+
+### In this book
+
+Setting up a client-server or cloud DBMS would be a pain for this book, so we'll instead use an in-process DBMS that lives entirely in an R package: duckdb.
+Thanks to the magic of DBI, the only difference between using duckdb and any other DBMS is how you'll connect to the database.
+This makes it great to teach with because you can easily run this code, and then easily take what you learn and apply it elsewhere.
+
+Connecting to duckdb is particularly simple because the defaults create a temporary database that is deleted when you quit R.
+That's great for learning because it guarantees that you'll start from a clean slate every time you restart R:
+
+```{r}
+con <- DBI::dbConnect(duckdb::duckdb())
+```
+
+duckdb is a high-performance database that's designed very much for the needs of a data scientist.
+We use it here because it's very easy to get started with, but it's also capable of handling gigabytes of data with great speed.
+If you want to use duckdb for a real data analysis project, you'll also need to supply the `dbdir` argument to make a persistent database and tell duckdb where to save it.
+Assuming you're using a project (@sec-workflow-scripts-projects), it's reasonable to store it in the `duckdb` directory of the current project:
+
+```{r}
+#| eval: false
+con <- DBI::dbConnect(duckdb::duckdb(), dbdir = "duckdb")
+```
+
+### Load some data {#sec-load-data}
+
+Since this is a new database, we need to start by adding some data.
+Here we'll add the `mpg` and `diamonds` datasets from ggplot2 using `DBI::dbWriteTable()`.
+The simplest usage of `dbWriteTable()` needs three arguments: a database connection, the name of the table to create in the database, and a data frame of data.
+
+```{r}
+dbWriteTable(con, "mpg", ggplot2::mpg)
+dbWriteTable(con, "diamonds", ggplot2::diamonds)
+```
+
+If you're using duckdb in a real project, we highly recommend learning about `duckdb_read_csv()` and `duckdb_register_arrow()`.
+These give you powerful and performant ways to quickly load data directly into duckdb, without having to first load it into R.
+We'll also show off a useful technique for loading multiple files into a database in @sec-save-database.
+
+### DBI basics
+
+You can check that the data is loaded correctly by using a couple of other DBI functions: `dbListTables()` lists all tables in the database[^databases-3] and `dbReadTable()` retrieves the contents of a table.
+
+[^databases-3]: At least, all the tables that you have permission to see.
+
+```{r}
+dbListTables(con)
+
+con |>
+  dbReadTable("diamonds") |>
+  as_tibble()
+```
+
+`dbReadTable()` returns a `data.frame`, so we use `as_tibble()` to convert it into a tibble, which prints more nicely.
+
+If you already know SQL, you can use `dbGetQuery()` to get the results of running a query on the database:
+
+```{r}
+sql <- "
+  SELECT carat, cut, clarity, color, price
+  FROM diamonds
+  WHERE price > 15000
+"
+as_tibble(dbGetQuery(con, sql))
+```
+
+If you've never seen SQL before, don't worry!
+You'll learn more about it shortly.
+But if you read it carefully, you might guess that it selects five columns of the diamonds dataset and all the rows where `price` is greater than 15,000.
+
+## dbplyr basics
+
+Now that we've connected to a database and loaded up some data, we can start to learn about dbplyr.
+dbplyr is a dplyr **backend**, which means that you keep writing dplyr code but the backend executes it differently.
+In this case, dbplyr translates your dplyr code to SQL; other backends include [dtplyr](https://dtplyr.tidyverse.org) which translates to [data.table](https://r-datatable.com), and [multidplyr](https://multidplyr.tidyverse.org) which executes your code on multiple cores.
+
+To use dbplyr, you must first use `tbl()` to create an object that represents a database table:
+
+```{r}
+diamonds_db <- tbl(con, "diamonds")
+diamonds_db
+```
+
+::: callout-note
+There are two other common ways to interact with a database.
+First, many corporate databases are very large so you need some hierarchy to keep all the tables organized.
+In that case you might need to supply a schema, or a catalog and a schema, in order to pick the table you're interested in:
+
+```{r}
+#| eval: false
+diamonds_db <- tbl(con, in_schema("sales", "diamonds"))
+diamonds_db <- tbl(con, in_catalog("north_america", "sales", "diamonds"))
+```
+
+Other times you might want to use your own SQL query as a starting point:
+
+```{r}
+#| eval: false
+diamonds_db <- tbl(con, sql("SELECT * FROM diamonds"))
+```
+:::
+
+This object is **lazy**; when you use dplyr verbs on it, dplyr doesn't do any work: it just records the sequence of operations that you want to perform and only performs them when needed.
+For example, take the following pipeline:
+
+```{r}
+big_diamonds_db <- diamonds_db |>
+  filter(price > 15000) |>
+  select(carat:clarity, price)
+
+big_diamonds_db
+```
+
+You can tell this object represents a database query because it prints the DBMS name at the top, and while it tells you the number of columns, it typically doesn't know the number of rows.
+This is because finding the total number of rows usually requires executing the complete query, something we're trying to avoid.
+
+You can see the SQL code generated by the dplyr function `show_query()`.
+If you know dplyr, this is a great way to learn SQL!
+Write some dplyr code, get dbplyr to translate it to SQL, and then try to figure out how the two languages match up.
+
+```{r}
+big_diamonds_db |>
+  show_query()
+```
+
+To get all the data back into R, you call `collect()`.
+Behind the scenes, this generates the SQL, calls `dbGetQuery()` to get the data, then turns the result into a tibble:
+
+```{r}
+big_diamonds <- big_diamonds_db |>
+  collect()
+big_diamonds
+```
+
+Typically, you'll use dbplyr to select the data you want from the database, performing basic filtering and aggregation using the translations described below.
+Then, once you're ready to analyze the data with functions that are unique to R, you'll `collect()` the data to get an in-memory tibble, and continue your work with pure R code.
+
+## SQL
+
+The rest of the chapter will teach you a little SQL through the lens of dbplyr.
+It's a rather non-traditional introduction to SQL, but we hope it will get you quickly up to speed with the basics.
+Luckily, if you understand dplyr you're in a great place to quickly pick up SQL because so many of the concepts are the same.
+
+We'll explore the relationship between dplyr and SQL using a couple of old friends from the nycflights13 package: `flights` and `planes`.
+These datasets are easy to get into our learning database because dbplyr comes with a function that copies the tables from nycflights13 to our database:
+
+```{r}
+dbplyr::copy_nycflights13(con)
+flights <- tbl(con, "flights")
+planes <- tbl(con, "planes")
+```
+
+```{r}
+#| echo: false
+options(dplyr.strict_sql = TRUE)
+```
+
+### SQL basics
+
+The top-level components of SQL are called **statements**.
+Common statements include `CREATE` for defining new tables, `INSERT` for adding data, and `SELECT` for retrieving data.
+We will focus on `SELECT` statements, also called **queries**, because they are almost exclusively what you'll use as a data scientist.
+
+A query is made up of **clauses**.
+There are five important clauses: `SELECT`, `FROM`, `WHERE`, `ORDER BY`, and `GROUP BY`.
+Every query must have the `SELECT`[^databases-4] and `FROM`[^databases-5] clauses, and the simplest query is `SELECT * FROM table`, which selects all columns from the specified table.
+This is what dbplyr generates for an unadulterated table:
+
+[^databases-4]: Confusingly, depending on the context, `SELECT` is either a statement or a clause.
+    To avoid this confusion, we'll generally use `SELECT` query instead of `SELECT` statement.
+
+[^databases-5]: Ok, technically, only the `SELECT` is required, since you can write queries like `SELECT 1+1` to perform basic calculations.
+    But if you want to work with data (as you always do!) you'll also need a `FROM` clause.
+
+```{r}
+flights |> show_query()
+planes |> show_query()
+```
+
+`WHERE` and `ORDER BY` control which rows are included and how they are ordered:
+
+```{r}
+flights |>
+  filter(dest == "IAH") |>
+  arrange(dep_delay) |>
+  show_query()
+```
+
+`GROUP BY` converts the query to a summary, causing aggregation to happen:
+
+```{r}
+flights |>
+  group_by(dest) |>
+  summarize(dep_delay = mean(dep_delay, na.rm = TRUE)) |>
+  show_query()
+```
+
+There are two important differences between dplyr verbs and SELECT clauses:
+
+- In SQL, case doesn't matter: you can write `select`, `SELECT`, or even `SeLeCt`. In this book we'll stick with the common convention of writing SQL keywords in uppercase to distinguish them from table or variable names.
+- In SQL, order matters: you must always write the clauses in the order `SELECT`, `FROM`, `WHERE`, `GROUP BY`, `ORDER BY`. Confusingly, this order doesn't match how the clauses are actually evaluated, which is first `FROM`, then `WHERE`, `GROUP BY`, `SELECT`, and `ORDER BY`.
+
+The following sections explore each clause in more detail.
+
+::: callout-note
+Note that while SQL is a standard, it is extremely complex and no database follows it exactly.
+While the main components that we'll focus on in this book are very similar between DBMS's, there are many minor variations.
+Fortunately, dbplyr is designed to handle this problem and generates different translations for different databases.
+It's not perfect, but it's continually improving, and if you hit a problem you can file an issue [on GitHub](https://github.com/tidyverse/dbplyr/issues/) to help us do better.
+:::
+
+### SELECT
+
+The `SELECT` clause is the workhorse of queries and performs the same job as `select()`, `mutate()`, `rename()`, `relocate()`, and, as you'll learn in the next section, `summarize()`.
+
+`select()`, `rename()`, and `relocate()` have very direct translations to `SELECT` as they just affect where a column appears (if at all) along with its name:
+
+```{r}
+planes |>
+  select(tailnum, type, manufacturer, model, year) |>
+  show_query()
+
+planes |>
+  select(tailnum, type, manufacturer, model, year) |>
+  rename(year_built = year) |>
+  show_query()
+
+planes |>
+  select(tailnum, type, manufacturer, model, year) |>
+  relocate(manufacturer, model, .before = type) |>
+  show_query()
+```
+
+This example also shows you how SQL does renaming.
+In SQL terminology, renaming is called **aliasing** and is done with `AS`.
+Note that unlike `mutate()`, the old name is on the left and the new name is on the right.
+
+::: callout-note
+In the examples above, note that `"year"` and `"type"` are wrapped in double quotes.
+That's because these are **reserved words** in duckdb, so dbplyr quotes them to avoid any potential confusion between column/table names and SQL operators.
+
+When working with other databases you're likely to see every variable name quoted, because only a handful of client packages, like duckdb, know what all the reserved words are, so they quote everything to be safe.
+
+``` sql
+SELECT "tailnum", "type", "manufacturer", "model", "year"
+FROM "planes"
+```
+
+Some other database systems use backticks instead of quotes:
+
+``` sql
+SELECT `tailnum`, `type`, `manufacturer`, `model`, `year`
+FROM `planes`
+```
+:::
+
+The translations for `mutate()` are similarly straightforward: each variable becomes a new expression in `SELECT`:
+
+```{r}
+flights |>
+  mutate(
+    speed = distance / (air_time / 60)
+  ) |>
+  show_query()
+```
+
+We'll come back to the translation of individual components (like `/`) in @sec-sql-expressions.
+
+### FROM
+
+The `FROM` clause defines the data source.
+It's going to be rather uninteresting for a little while, because we're just using single tables.
+You'll see more complex examples once we hit the join functions.
+
+### GROUP BY
+
+`group_by()` is translated to the `GROUP BY`[^databases-6] clause and `summarize()` is translated to the `SELECT` clause:
+
+[^databases-6]: This is no coincidence: the dplyr function name was inspired by the SQL clause.
+
+```{r}
+diamonds_db |>
+  group_by(cut) |>
+  summarize(
+    n = n(),
+    avg_price = mean(price, na.rm = TRUE)
+  ) |>
+  show_query()
+```
+
+We'll come back to what's happening with the translation of `n()` and `mean()` in @sec-sql-expressions.
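+
+If you want to check your reading of the translation, one option is to write the query by hand and run it with `dbGetQuery()` from earlier in the chapter (a sketch; dbplyr's own output will differ in quoting and formatting):
+
+```{r}
+#| eval: false
+
+# Hand-written equivalent of the grouped summary above:
+# n() becomes COUNT(*), and mean(price, na.rm = TRUE) becomes AVG(price).
+dbGetQuery(con, "
+  SELECT cut, COUNT(*) AS n, AVG(price) AS avg_price
+  FROM diamonds
+  GROUP BY cut
+")
+```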
+
+### WHERE
+
+`filter()` is translated to the `WHERE` clause:
+
+```{r}
+flights |>
+  filter(dest == "IAH" | dest == "HOU") |>
+  show_query()
+
+flights |>
+  filter(arr_delay > 0 & arr_delay < 20) |>
+  show_query()
+```
+
+There are a few important details to note here:
+
+- `|` becomes `OR` and `&` becomes `AND`.
+- SQL uses `=` for comparison, not `==`. SQL doesn't have assignment, so there's no potential for confusion there.
+- SQL uses only `''` for strings, not `""`. In SQL, `""` is used to identify variables, like R's ``` `` ```.
+
+Another useful SQL operator is `IN`, which is very close to R's `%in%`:
+
+```{r}
+flights |>
+  filter(dest %in% c("IAH", "HOU")) |>
+  show_query()
+```
+
+SQL uses `NULL` instead of `NA`.
+`NULL`s behave similarly to `NA`s.
+The main difference is that while they're "infectious" in comparisons and arithmetic, they are silently dropped when summarizing.
+dbplyr will remind you about this behavior the first time you hit it:
+
+```{r}
+flights |>
+  group_by(dest) |>
+  summarize(delay = mean(arr_delay))
+```
+
+If you want to learn more about how `NULL`s work, you might enjoy "[*Three valued logic*](https://modern-sql.com/concept/three-valued-logic)" by Markus Winand.
+
+In general, you can work with `NULL`s using the functions you'd use for `NA`s in R:
+
+```{r}
+flights |>
+  filter(!is.na(dep_delay)) |>
+  show_query()
+```
+
+This SQL query illustrates one of the drawbacks of dbplyr: while the SQL is correct, it isn't as simple as you might write by hand.
+In this case, you could drop the parentheses and use a special operator that's easier to read:
+
+``` sql
+WHERE "dep_delay" IS NOT NULL
+```
+
+Note that if you `filter()` a variable that you created using `summarize()`, dbplyr will generate a `HAVING` clause, rather than a `WHERE` clause.
+This is one of the idiosyncrasies of SQL: `WHERE` is evaluated before `SELECT` and `GROUP BY`, so SQL needs another clause that's evaluated afterwards.
+
+```{r}
+diamonds_db |>
+  group_by(cut) |>
+  summarize(n = n()) |>
+  filter(n > 100) |>
+  show_query()
+```
+
+### ORDER BY
+
+Ordering rows involves a straightforward translation from `arrange()` to the `ORDER BY` clause:
+
+```{r}
+flights |>
+  arrange(year, month, day, desc(dep_delay)) |>
+  show_query()
+```
+
+Notice how `desc()` is translated to `DESC`: this is one of the many dplyr functions whose name was directly inspired by SQL.
+
+### Subqueries
+
+Sometimes it's not possible to translate a dplyr pipeline into a single `SELECT` statement and you need to use a subquery.
+A **subquery** is just a query used as a data source in the `FROM` clause, instead of the usual table.
+
+dbplyr typically uses subqueries to work around limitations of SQL.
+For example, expressions in the `SELECT` clause can't refer to columns that were just created.
+That means that the following (silly) dplyr pipeline needs to happen in two steps: the first (inner) query computes `year1` and then the second (outer) query can compute `year2`.
+
+```{r}
+flights |>
+  mutate(
+    year1 = year + 1,
+    year2 = year1 + 1
+  ) |>
+  show_query()
+```
+
+You'll also see this if you attempt to `filter()` a variable that you just created.
+Remember, even though `WHERE` is written after `SELECT`, it's evaluated before it, so we need a subquery in this (silly) example:
+
+```{r}
+flights |>
+  mutate(year1 = year + 1) |>
+  filter(year1 == 2014) |>
+  show_query()
+```
+
+Sometimes dbplyr will create a subquery where it's not needed, because it doesn't yet know how to optimize that translation.
+As dbplyr improves over time, these cases will get rarer but will probably never go away.
+
+### Joins
+
+If you're familiar with dplyr's joins, SQL joins are very similar.
+Here's a simple example:
+
+```{r}
+flights |>
+  left_join(planes |> rename(year_built = year), by = "tailnum") |>
+  show_query()
+```
+
+The main thing to notice here is the syntax: SQL joins use sub-clauses of the `FROM` clause to bring in additional tables, using `ON` to define how the tables are related.
+
+dplyr's names for these functions are so closely connected to SQL that you can easily guess the equivalent SQL for `inner_join()`, `right_join()`, and `full_join()`:
+
+``` sql
+SELECT flights.*, "type", manufacturer, model, engines, seats, speed
+FROM flights
+INNER JOIN planes ON (flights.tailnum = planes.tailnum)
+
+SELECT flights.*, "type", manufacturer, model, engines, seats, speed
+FROM flights
+RIGHT JOIN planes ON (flights.tailnum = planes.tailnum)
+
+SELECT flights.*, "type", manufacturer, model, engines, seats, speed
+FROM flights
+FULL JOIN planes ON (flights.tailnum = planes.tailnum)
+```
+
+You're likely to need many joins when working with data from a database.
+That's because database tables are often stored in a highly normalized form, where each "fact" is stored in a single place; to keep a complete dataset for analysis, you need to navigate a complex network of tables connected by primary and foreign keys.
+If you hit this scenario, the [dm package](https://cynkra.github.io/dm/), by Tobias Schieferdecker, Kirill Müller, and Darko Bergant, is a lifesaver.
+It can automatically determine the connections between tables using the constraints that DBAs often supply, visualize the connections so you can see what's going on, and generate the joins you need to connect one table to another.
+
+### Other verbs
+
+dbplyr also translates other verbs like `distinct()`, `slice_*()`, and `intersect()`, and a growing selection of tidyr functions like `pivot_longer()` and `pivot_wider()`.
+The easiest way to see the full set of what's currently available is to visit the dbplyr website: <https://dbplyr.tidyverse.org>.
+
+### Exercises
+
+1. What is `distinct()` translated to?
+    How about `head()`?
+
+2. Explain what each of the following SQL queries does and try to recreate them using dbplyr.
+
+    ``` sql
+    SELECT *
+    FROM flights
+    WHERE dep_delay < arr_delay
+
+    SELECT *, distance / (air_time / 60) AS speed
+    FROM flights
+    ```
+
+## Function translations {#sec-sql-expressions}
+
+So far we've focused on the big picture of how dplyr verbs are translated to the clauses of a query.
+Now we're going to zoom in a little and talk about the translation of the R functions that work with individual columns, e.g., what happens when you use `mean(x)` in a `summarize()`?
+
+To help see what's going on, we'll use a couple of little helper functions that run a `summarize()` or `mutate()` and show the generated SQL.
+That will make it a little easier to explore a few variations and see how summaries and transformations can differ.
+
+```{r}
+summarize_query <- function(df, ...) {
+  df |>
+    summarize(...) |>
+    show_query()
+}
+mutate_query <- function(df, ...) {
+  df |>
+    mutate(..., .keep = "none") |>
+    show_query()
+}
+```
+
+Let's dive in with some summaries!
+Looking at the code below you'll notice that some summary functions, like `mean()`, have a relatively simple translation while others, like `median()`, are much more complex.
+The complexity is typically higher for operations that are common in statistics but less common in databases.
+
+```{r}
+flights |>
+  group_by(year, month, day) |>
+  summarize_query(
+    mean = mean(arr_delay, na.rm = TRUE),
+    median = median(arr_delay, na.rm = TRUE)
+  )
+```
+
+The translation of summary functions becomes more complicated when you use them inside a `mutate()` because they have to turn into so-called **window** functions.
+In SQL, you turn an ordinary aggregation function into a window function by adding `OVER` after it:
+
+```{r}
+flights |>
+  group_by(year, month, day) |>
+  mutate_query(
+    mean = mean(arr_delay, na.rm = TRUE),
+  )
+```
+
+In SQL, the `GROUP BY` clause is used exclusively for summaries, so here you can see that the grouping has moved from the `GROUP BY` clause to the `PARTITION BY` argument of `OVER`.
+
+Window functions include all functions that look forwards or backwards, like `lead()` and `lag()`, which look at the "next" or "previous" value respectively:
+
+```{r}
+flights |>
+  group_by(dest) |>
+  arrange(time_hour) |>
+  mutate_query(
+    lead = lead(arr_delay),
+    lag = lag(arr_delay)
+  )
+```
+
+Here it's important to `arrange()` the data, because SQL tables have no intrinsic order.
+In fact, if you don't use `arrange()` you might get the rows back in a different order every time!
+Notice that for window functions, the ordering information is repeated: the `ORDER BY` clause of the main query doesn't automatically apply to window functions.
+
+Another important SQL function is `CASE WHEN`. It's used as the translation of `if_else()` and `case_when()`, the dplyr function that it directly inspired.
+Here are a couple of simple examples:
+
+```{r}
+flights |>
+  mutate_query(
+    description = if_else(arr_delay > 0, "delayed", "on-time")
+  )
+flights |>
+  mutate_query(
+    description =
+      case_when(
+        arr_delay < -5 ~ "early",
+        arr_delay < 5 ~ "on-time",
+        arr_delay >= 5 ~ "late"
+      )
+  )
+```
+
+`CASE WHEN` is also used for some other functions that don't have a direct translation from R to SQL.
+A good example of this is `cut()`:
+
+```{r}
+flights |>
+  mutate_query(
+    description = cut(
+      arr_delay,
+      breaks = c(-Inf, -5, 5, Inf),
+      labels = c("early", "on-time", "late")
+    )
+  )
+```
+
+dbplyr also translates common string and date-time manipulation functions, which you can learn about in `vignette("translation-function", package = "dbplyr")`.
+dbplyr's translations are certainly not perfect, and there are many R functions that aren't translated yet, but dbplyr does a surprisingly good job covering the functions that you'll use most of the time.
+
+## Summary
+
+In this chapter you learned how to access data from databases.
+We focused on dbplyr, a dplyr "backend" that allows you to write the dplyr code you're familiar with, and have it be automatically translated to SQL.
+We used that translation to teach you a little SQL; it's important to learn some SQL because it's *the* most commonly used language for working with data and knowing some will make it easier for you to communicate with other data folks who don't use R.
+If you've finished this chapter and would like to learn more about SQL, we have two recommendations:
+
+- [*SQL for Data Scientists*](https://sqlfordatascientists.com) by Renée M. P. Teate is an introduction to SQL designed specifically for the needs of data scientists, and includes examples of the sort of highly interconnected data you're likely to encounter in real organizations.
+- [*Practical SQL*](https://www.practicalsql.com) by Anthony DeBarros is written from the perspective of a data journalist (a data scientist specialized in telling compelling stories) and goes into more detail about getting your data into a database and running your own DBMS.
+
+In the next chapter, we'll learn about another dplyr backend for working with large data: arrow.
+Arrow is designed for working with large files on disk, and is a natural complement to databases.
diff --git a/datetimes.Rmd b/datetimes.Rmd
deleted file mode 100644
index e9477f558..000000000
--- a/datetimes.Rmd
+++ /dev/null
@@ -1,571 +0,0 @@
-# Dates and times
-
-## Introduction
-
-This chapter will show you how to work with dates and times in R. At first glance, dates and times seem simple. You use them all the time in your regular life, and they don't seem to cause much confusion. However, the more you learn about dates and times, the more complicated they seem to get. To warm up, try these three seemingly simple questions:
-
-* Does every year have 365 days?
-* Does every day have 24 hours?
-* Does every minute have 60 seconds?
-
-I'm sure you know that not every year has 365 days, but do you know the full rule for determining if a year is a leap year? (It has three parts.) You might have remembered that many parts of the world use daylight savings time (DST), so that some days have 23 hours, and others have 25. You might not have known that some minutes have 61 seconds because every now and then leap seconds are added because the Earth's rotation is gradually slowing down.
-
-Dates and times are hard because they have to reconcile two physical phenomena (the rotation of the Earth and its orbit around the sun) with a whole raft of geopolitical phenomena including months, time zones, and DST. This chapter won't teach you every last detail about dates and times, but it will give you a solid grounding of practical skills that will help you with common data analysis challenges.
-
-### Prerequisites
-
-This chapter will focus on the __lubridate__ package, which makes it easier to work with dates and times in R. lubridate is not part of core tidyverse because you only need it when you're working with dates/times. We will also need nycflights13 for practice data.
-
-```{r setup, message = FALSE}
-library(tidyverse)
-
-library(lubridate)
-library(nycflights13)
-```
-
-## Creating date/times
-
-There are three types of date/time data that refer to an instant in time:
-
-* A __date__. Tibbles print this as `<date>`.
-
-* A __time__ within a day. Tibbles print this as `