diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf2f..1958855bf 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,5 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.travis\.yml$ +^\.github$ +^CODE_OF_CONDUCT\.md$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 000000000..2d19fc766 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/build_book.yaml b/.github/workflows/build_book.yaml new file mode 100644 index 000000000..cdcba86cb --- /dev/null +++ b/.github/workflows/build_book.yaml @@ -0,0 +1,60 @@ +on: + push: + branches: main + pull_request: + branches: main + # to be able to trigger a manual build + workflow_dispatch: + schedule: + # run every day at 11 PM + - cron: '0 23 * * *' + +name: Render and deploy Book to Netlify + +env: + isExtPR: ${{ github.event.pull_request.head.repo.fork == true }} + RUST_BACKTRACE: 1 + +jobs: + build-deploy: + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - name: Install Quarto + uses: quarto-dev/quarto-actions/install-quarto@v1 + with: + # To install LaTeX to build PDF book + tinytex: true + # uncomment below and fill to pin a version + # version: 0.9.105 + + - uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - uses: r-lib/actions/setup-r-dependencies@v2 + + - name: Render book to all formats + # Add any command line argument needed + run: | + quarto render + + - name: Deploy to Netlify + if: contains(env.isExtPR, 'false') + id: netlify-deploy + uses: nwtgck/actions-netlify@v1.1 + with: + publish-dir: './_book' + production-branch: main + github-token: ${{ secrets.GITHUB_TOKEN }} + deploy-message: 'Deploy from GHA: ${{ github.event.pull_request.title || github.event.head_commit.message }} (${{ github.sha }})' + enable-pull-request-comment: false + enable-commit-comment: false + env: + NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} + NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID_2E }} + timeout-minutes: 1 diff --git a/.gitignore b/.gitignore index 0c451465c..00b8f139f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,11 +5,18 @@ _main.rds _book *.md -*.html +!CODE_OF_CONDUCT.md +/*.html +!plausible.html search_index.json libs *.rds _main.* -bookdown* tmp-pdfcrop-* figures +/.quarto/ +site_libs +/data/seattle-library-checkouts.csv +/data/seattle-library-checkouts.parquet +/data/seattle-library-checkouts +oreilly diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d3c3534e3..000000000 --- a/.travis.yml +++ /dev/null @@ -1,22 +0,0 @@ -language: R -cache: - packages: true - directories: - - _book - -addons: - apt: - packages: - - libxml2-dev - -script: - - Rscript -e 'bookdown::render_book("index.rmd")' - -deploy: - skip_cleanup: true - provider: s3 - access_key_id: AKIAJ6PXDYWD72R6HBYA - secret_access_key: - secure:
"KB6D4dRFyqABOUBC6q6CTI7WZQ+4kFOSDWNQFAbXJQR4TzR8J6uddAiSZyG8T1/8z+9Lm1VK417Zi0dGm3r3epbSnLClitBetvE11DoByomK+ey+NJ0MdXuXbFCJhX9l+8QDbDRLd/b2MEr36JXNaNQaLf5wdHImVVfcCm5STAIOM42plYMvz4Uhao+VjIKo+0IqiGHQHsNcU4qQXS4jd4FtO/t1xCwa7SgH0wwV2yJmeh8mM7QpmUEpBcZTHDvqZu6BitxtkYQDCh1iuBwhbPlYug/WOtyHmKYgU/c3+C+xW4OLv10OsE+eK6noEzIXQ80sPIyKMpkn+9P+7MnoRU/oZTXmYJOuXE5mvy+CiJ4TzZZxzB/g8HzklRRI4eFBmJ/zTTMmJMwBdbUhCXepARe4gr7pDFKhSTXvBVxljJBrkiGz6W1JeZ9nKzUbuIlWNJ9aaYM2UDMbRef7xyKlKbBNw1+90aTTW8Jo+0Sz3/R7daBTcnr0Bszg4QCaOMoxJJF/Ty/tTHiComAt/kNRqlSiU2g/Ch0jOz5TRV3c29OjQQ/a9ftf5pqlvgStwjjszgHQfRrd4mxGq2E/1gkPGL7ada+TWPAVjCc8HtPGK/36IjSccFB6qGkwTFf3uOBmAC2XVnJJlwG8v20nL5ZZwpCCbQANeQq/ILQsYUmk7RM=" - bucket: r4ds.had.co.nz - local-dir: _book diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..b36903fa8 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, +and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall +community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or +advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email +address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards +of acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies +when an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at [INSERT CONTACT +METHOD]. All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, +available at https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/DESCRIPTION b/DESCRIPTION index eca8156c5..7e4996ebf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,32 +2,39 @@ Package: r4ds Title: R for data science. 
Version: 0.1 Authors@R: c( - person("Hadley", "Wickham", , "hadley@rstudio.com", c("aut", "cre")), - person("Garrett", "Grolemund", , "garrett@rstudio.com", "aut") + person("Hadley", "Wickham", , "hadley@rstudio.com", role = c("aut", "cre")), + person("Mine", "\u00C7etinkaya-Rundel", , "mine@rstudio.com", role = "aut"), + person("Garrett", "Grolemund", , "garrett@rstudio.com", role = "aut") ) -Depends: R (>= 3.1.0) URL: https://github.com/hadley/r4ds +Depends: + R (>= 3.1.0) Imports: - bookdown, - condvis, - gapminder, - ggrepel, - hexbin, - htmltools, - htmlwidgets, - jpeg, - knitr, - Lahman, - leaflet, - maps, - microbenchmark, - nycflights13, - png, - pryr, - tidyverse, - viridis -Remotes: - hadley/ggplot2, - slowkow/ggrepel, - rstudio/bookdown, - rstudio/rmarkdown + arrow, + babynames, + curl (>= 5.0.0), + duckdb, + gapminder, + ggrepel, + ggridges, + ggthemes, + hexbin, + janitor, + Lahman, + leaflet, + maps, + nycflights13, + openxlsx, + palmerpenguins, + repurrrsive (>= 1.1.0), + tidymodels, + tidyverse (>= 2.0.0), + writexl +Suggests: + downlit, + jpeg, + knitr, + sessioninfo +Remotes: tidyverse/dplyr +Encoding: UTF-8 +License: CC BY-NC-ND 3.0 US diff --git a/EDA.Rmd b/EDA.Rmd deleted file mode 100644 index b8a2977b8..000000000 --- a/EDA.Rmd +++ /dev/null @@ -1,595 +0,0 @@ -# Exploratory Data Analysis - -## Introduction - -This chapter will show you how to use visualisation and transformation to explore your data in a systematic way, a task that statisticians call exploratory data analysis, or EDA for short. EDA is an iterative cycle. You: - -1. Generate questions about your data. - -1. Search for answers by visualising, transforming, and modelling your data. - -1. Use what you learn to refine your questions and/or generate new questions. - -EDA is not a formal process with a strict set of rules. More than anything, EDA is a state of mind. During the initial phases of EDA you should feel free to investigate every idea that occurs to you. Some of these ideas will pan out, and some will be dead ends. As your exploration continues, you will home in on a few particularly productive areas that you'll eventually write up and communicate to others. - -EDA is an important part of any data analysis, even if the questions are handed to you on a platter, because you always need to investigate the quality of your data. Data cleaning is just one application of EDA: you ask questions about whether your data meets your expectations or not. To do data cleaning, you'll need to deploy all the tools of EDA: visualisation, transformation, and modelling. - -### Prerequisites - -In this chapter we'll combine what you've learned about dplyr and ggplot2 to interactively ask questions, answer them with data, and then ask new questions. - -```{r setup, message = FALSE} -library(tidyverse) -``` - -## Questions - -> "There are no routine statistical questions, only questionable statistical -> routines." --- Sir David Cox - -> "Far better an approximate answer to the right question, which is often -> vague, than an exact answer to the wrong question, which can always be made -> precise." --- John Tukey - -Your goal during EDA is to develop an understanding of your data. The easiest way to do this is to use questions as tools to guide your investigation. When you ask a question, the question focuses your attention on a specific part of your dataset and helps you decide which graphs, models, or transformations to make. - -EDA is fundamentally a creative process. 
And like most creative processes, the key to asking _quality_ questions is to generate a large _quantity_ of questions. It is difficult to ask revealing questions at the start of your analysis because you do not know what insights are contained in your dataset. On the other hand, each new question that you ask will expose you to a new aspect of your data and increase your chance of making a discovery. You can quickly drill down into the most interesting parts of your data---and develop a set of thought-provoking questions---if you follow up each question with a new question based on what you find. - -There is no rule about which questions you should ask to guide your research. However, two types of questions will always be useful for making discoveries within your data. You can loosely word these questions as: - -1. What type of variation occurs within my variables? - -1. What type of covariation occurs between my variables? - -The rest of this chapter will look at these two questions. I'll explain what variation and covariation are, and I'll show you several ways to answer each question. To make the discussion easier, let's define some terms: - -* A __variable__ is a quantity, quality, or property that you can measure. - -* A __value__ is the state of a variable when you measure it. The value of a - variable may change from measurement to measurement. - -* An __observation__ is a set of measurements made under similar conditions - (you usually make all of the measurements in an observation at the same - time and on the same object). An observation will contain several values, - each associated with a different variable. I'll sometimes refer to - an observation as a data point. - -* __Tabular data__ is a set of values, each associated with a variable and an - observation. Tabular data is _tidy_ if each value is placed in its own - "cell", each variable in its own column, and each observation in its own - row. - -So far, all of the data that you've seen has been tidy. In real-life, most data isn't tidy, so we'll come back to these ideas again in [tidy data]. - -## Variation - -**Variation** is the tendency of the values of a variable to change from measurement to measurement. You can see variation easily in real life; if you measure any continuous variable twice, you will get two different results. This is true even if you measure quantities that are constant, like the speed of light. Each of your measurements will include a small amount of error that varies from measurement to measurement. Categorical variables can also vary if you measure across different subjects (e.g. the eye colors of different people), or different times (e.g. the energy levels of an electron at different moments). -Every variable has its own pattern of variation, which can reveal interesting information. The best way to understand that pattern is to visualise the distribution of the variable's values. - -### Visualising distributions - -How you visualise the distribution of a variable will depend on whether the variable is categorical or continuous. A variable is **categorical** if it can only take one of a small set of values. In R, categorical variables are usually saved as factors or character vectors. To examine the distribution of a categorical variable, use a bar chart: - -```{r} -ggplot(data = diamonds) + - geom_bar(mapping = aes(x = cut)) -``` - -The height of the bars displays how many observations occurred with each x value. 
You can compute these values manually with `dplyr::count()`: - -```{r} -diamonds %>% - count(cut) -``` - -A variable is **continuous** if it can take any of an infinite set of ordered values. Numbers and date-times are two examples of continuous variables. To examine the distribution of a continuous variable, use a histogram: - -```{r} -ggplot(data = diamonds) + - geom_histogram(mapping = aes(x = carat), binwidth = 0.5) -``` - -You can compute this by hand by combining `dplyr::count()` and `ggplot2::cut_width()`: - -```{r} -diamonds %>% - count(cut_width(carat, 0.5)) -``` - -A histogram divides the x-axis into equally spaced bins and then uses the height of a bar to display the number of observations that fall in each bin. In the graph above, the tallest bar shows that almost 30,000 observations have a `carat` value between 0.25 and 0.75, which are the left and right edges of the bar. - -You can set the width of the intervals in a histogram with the `binwidth` argument, which is measured in the units of the `x` variable. You should always explore a variety of binwidths when working with histograms, as different binwidths can reveal different patterns. For example, here is how the graph above looks when we zoom into just the diamonds with a size of less than three carats and choose a smaller binwidth. - -```{r} -smaller <- diamonds %>% - filter(carat < 3) - -ggplot(data = smaller, mapping = aes(x = carat)) + - geom_histogram(binwidth = 0.1) -``` - -If you wish to overlay multiple histograms in the same plot, I recommend using `geom_freqpoly()` instead of `geom_histogram()`. `geom_freqpoly()` performs the same calculation as `geom_histogram()`, but instead of displaying the counts with bars, uses lines instead. It's much easier to understand overlapping lines than bars. - -```{r} -ggplot(data = smaller, mapping = aes(x = carat, colour = cut)) + - geom_freqpoly(binwidth = 0.1) -``` - -There are a few challenges with this type of plot, which we will come back to in [visualising a categorical and a continuous variable](#cat-cont). - -Now that you can visualise variation, what should you look for in your plots? And what type of follow-up questions should you ask? I've put together a list below of the most useful types of information that you will find in your graphs, along with some follow-up questions for each type of information. The key to asking good follow-up questions will be to rely on your curiosity (What do you want to learn more about?) as well as your skepticism (How could this be misleading?). - -### Typical values - -In both bar charts and histograms, tall bars show the common values of a variable, and shorter bars show less-common values. Places that do not have bars reveal values that were not seen in your data. To turn this information into useful questions, look for anything unexpected: - -* Which values are the most common? Why? - -* Which values are rare? Why? Does that match your expectations? - -* Can you see any unusual patterns? What might explain them? - -As an example, the histogram below suggests several interesting questions: - -* Why are there more diamonds at whole carats and common fractions of carats? - -* Why are there more diamonds slightly to the right of each peak than there - are slightly to the left of each peak? - -* Why are there no diamonds bigger than 3 carats? - -```{r} -ggplot(data = smaller, mapping = aes(x = carat)) + - geom_histogram(binwidth = 0.01) -``` - -Clusters of similar values suggest that subgroups exist in your data. 
To understand the subgroups, ask: - -* How are the observations within each cluster similar to each other? - -* How are the observations in separate clusters different from each other? - -* How can you explain or describe the clusters? - -* Why might the appearance of clusters be misleading? - -The histogram below shows the length (in minutes) of 272 eruptions of the Old Faithful Geyser in Yellowstone National Park. Eruption times appear to be clustered into two groups: there are short eruptions (of around 2 minutes) and long eruptions (4-5 minutes), but little in between. - -```{r} -ggplot(data = faithful, mapping = aes(x = eruptions)) + - geom_histogram(binwidth = 0.25) -``` - -Many of the questions above will prompt you to explore a relationship *between* variables, for example, to see if the values of one variable can explain the behavior of another variable. We'll get to that shortly. - -### Unusual values - -Outliers are observations that are unusual; data points that don't seem to fit the pattern. Sometimes outliers are data entry errors; other times outliers suggest important new science. When you have a lot of data, outliers are sometimes difficult to see in a histogram. For example, take the distribution of the `y` variable from the diamonds dataset. The only evidence of outliers is the unusually wide limits on the x-axis. - -```{r} -ggplot(diamonds) + - geom_histogram(mapping = aes(x = y), binwidth = 0.5) -``` - -There are so many observations in the common bins that the rare bins are so short that you can't see them (although maybe if you stare intently at 0 you'll spot something). To make it easy to see the unusual values, we need to zoom to small values of the y-axis with `coord_cartesian()`: - -```{r} -ggplot(diamonds) + - geom_histogram(mapping = aes(x = y), binwidth = 0.5) + - coord_cartesian(ylim = c(0, 50)) -``` - -(`coord_cartesian()` also has an `xlim()` argument for when you need to zoom into the x-axis. ggplot2 also has `xlim()` and `ylim()` functions that work slightly differently: they throw away the data outside the limits.) - -This allows us to see that there are three unusual values: 0, ~30, and ~60. We pluck them out with dplyr: - -```{r, include = FALSE} -old <- options(tibble.print_max = 10, tibble.print_min = 10) -``` - -```{r} -unusual <- diamonds %>% - filter(y < 3 | y > 20) %>% - select(price, x, y, z) %>% - arrange(y) -unusual -``` - -```{r, include = FALSE} -options(old) -``` - -The `y` variable measures one of the three dimensions of these diamonds, in mm. We know that diamonds can't have a width of 0mm, so these values must be incorrect. We might also suspect that measurements of 32mm and 59mm are implausible: those diamonds are over an inch long, but don't cost hundreds of thousands of dollars! - -It's good practice to repeat your analysis with and without the outliers. If they have minimal effect on the results, and you can't figure out why they're there, it's reasonable to replace them with missing values, and move on. However, if they have a substantial effect on your results, you shouldn't drop them without justification. You'll need to figure out what caused them (e.g. a data entry error) and disclose that you removed them in your write-up. - - -### Exercises - -1. Explore the distribution of each of the `x`, `y`, and `z` variables - in `diamonds`. What do you learn? Think about a diamond and how you - might decide which dimension is the length, width, and depth. - -1. Explore the distribution of `price`. 
Do you discover anything unusual - or surprising? (Hint: Carefully think about the `binwidth` and make sure - you try a wide range of values.) - -1. How many diamonds are 0.99 carat? How many are 1 carat? What - do you think is the cause of the difference? - -1. Compare and contrast `coord_cartesian()` vs `xlim()` or `ylim()` when - zooming in on a histogram. What happens if you leave `binwidth` unset? - What happens if you try and zoom so only half a bar shows? - -## Missing values - -If you've encountered unusual values in your dataset, and simply want to move on to the rest of your analysis, you have two options. - -1. Drop the entire row with the strange values: - - ```{r, eval = FALSE} - diamonds2 <- diamonds %>% - filter(between(y, 3, 20)) - ``` - - I don't recommend this option because just because one measurement - is invalid, doesn't mean all the measurements are. Additionally, if you - have low quality data, by time that you've applied this approach to every - variable you might find that you don't have any data left! - -1. Instead, I recommend replacing the unusual values with missing values. - The easiest way to do this is to use `mutate()` to replace the variable - with a modified copy. You can use the `ifelse()` function to replace - unusual values with `NA`: - - ```{r} - diamonds2 <- diamonds %>% - mutate(y = ifelse(y < 3 | y > 20, NA, y)) - ``` - -`ifelse()` has three arguments. The first argument `test` should be a logical vector. The result will contain the value of the second argument, `yes`, when `test` is `TRUE`, and the value of the third argument, `no`, when it is false. Alternatively to ifelse, use `dplyr::case_when()`. `case_when()` is particularly useful inside mutate when you want to create a new variable that relies on a complex combination of existing variables. - -Like R, ggplot2 subscribes to the philosophy that missing values should never silently go missing. It's not obvious where you should plot missing values, so ggplot2 doesn't include them in the plot, but it does warn that they've been removed: - -```{r, dev = "png"} -ggplot(data = diamonds2, mapping = aes(x = x, y = y)) + - geom_point() -``` - -To suppress that warning, set `na.rm = TRUE`: - -```{r, eval = FALSE} -ggplot(data = diamonds2, mapping = aes(x = x, y = y)) + - geom_point(na.rm = TRUE) -``` - -Other times you want to understand what makes observations with missing values different to observations with recorded values. For example, in `nycflights13::flights`, missing values in the `dep_time` variable indicate that the flight was cancelled. So you might want to compare the scheduled departure times for cancelled and non-cancelled times. You can do this by making a new variable with `is.na()`. - -```{r} -nycflights13::flights %>% - mutate( - cancelled = is.na(dep_time), - sched_hour = sched_dep_time %/% 100, - sched_min = sched_dep_time %% 100, - sched_dep_time = sched_hour + sched_min / 60 - ) %>% - ggplot(mapping = aes(sched_dep_time)) + - geom_freqpoly(mapping = aes(colour = cancelled), binwidth = 1/4) -``` - -However this plot isn't great because there are many more non-cancelled flights than cancelled flights. In the next section we'll explore some techniques for improving this comparison. - -### Exercises - -1. What happens to missing values in a histogram? What happens to missing - values in a bar chart? Why is there a difference? - -1. What does `na.rm = TRUE` do in `mean()` and `sum()`? 
- -## Covariation - -If variation describes the behavior _within_ a variable, covariation describes the behavior _between_ variables. **Covariation** is the tendency for the values of two or more variables to vary together in a related way. The best way to spot covariation is to visualise the relationship between two or more variables. How you do that should again depend on the type of variables involved. - -### A categorical and continuous variable {#cat-cont} - -It's common to want to explore the distribution of a continuous variable broken down by a categorical variable, as in the previous frequency polygon. The default appearance of `geom_freqpoly()` is not that useful for that sort of comparison because the height is given by the count. That means if one of the groups is much smaller than the others, it's hard to see the differences in shape. For example, let's explore how the price of a diamond varies with its quality: - -```{r} -ggplot(data = diamonds, mapping = aes(x = price)) + - geom_freqpoly(mapping = aes(colour = cut), binwidth = 500) -``` - -It's hard to see the difference in distribution because the overall counts differ so much: - -```{r, fig.width = "50%", fig.width = 4} -ggplot(diamonds) + - geom_bar(mapping = aes(x = cut)) -``` - -To make the comparison easier we need to swap what is displayed on the y-axis. Instead of displaying count, we'll display __density__, which is the count standardised so that the area under each frequency polygon is one. - -```{r} -ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) + - geom_freqpoly(mapping = aes(colour = cut), binwidth = 500) -``` - -There's something rather surprising about this plot - it appears that fair diamonds (the lowest quality) have the highest average price! But maybe that's because frequency polygons are a little hard to interpret - there's a lot going on in this plot. - -Another alternative to display the distribution of a continuous variable broken down by a categorical variable is the boxplot. A **boxplot** is a type of visual shorthand for a distribution of values that is popular among statisticians. Each boxplot consists of: - -* A box that stretches from the 25th percentile of the distribution to the - 75th percentile, a distance known as the interquartile range (IQR). In the - middle of the box is a line that displays the median, i.e. 50th percentile, - of the distribution. These three lines give you a sense of the spread of the - distribution and whether or not the distribution is symmetric about the - median or skewed to one side. - -* Visual points that display observations that fall more than 1.5 times the - IQR from either edge of the box. These outlying points are unusual - so are plotted individually. - -* A line (or whisker) that extends from each end of the box and goes to the - farthest non-outlier point in the distribution. - -```{r, echo = FALSE, out.width = "100%"} -knitr::include_graphics("images/EDA-boxplot.png") -``` - -Let's take a look at the distribution of price by cut using `geom_boxplot()`: - -```{r fig.height = 3} -ggplot(data = diamonds, mapping = aes(x = cut, y = price)) + - geom_boxplot() -``` - -We see much less information about the distribution, but the boxplots are much more compact so we can more easily compare them (and fit more on one plot). It supports the counterintuitive finding that better quality diamonds are cheaper on average! In the exercises, you'll be challenged to figure out why. 
- -`cut` is an ordered factor: fair is worse than good, which is worse than very good and so on. Many categorical variables don't have such an intrinsic order, so you might want to reorder them to make a more informative display. One way to do that is with the `reorder()` function. - -For example, take the `class` variable in the `mpg` dataset. You might be interested to know how highway mileage varies across classes: - -```{r} -ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + - geom_boxplot() -``` - -To make the trend easier to see, we can reorder `class` based on the median value of `hwy`: - -```{r fig.height = 3} -ggplot(data = mpg) + - geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) -``` - -If you have long variable names, `geom_boxplot()` will work better if you flip it 90°. You can do that with `coord_flip()`. - -```{r} -ggplot(data = mpg) + - geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) + - coord_flip() -``` - -#### Exercises - -1. Use what you've learned to improve the visualisation of the departure times - of cancelled vs. non-cancelled flights. - -1. What variable in the diamonds dataset is most important for predicting - the price of a diamond? How is that variable correlated with cut? - Why does the combination of those two relationships lead to lower quality - diamonds being more expensive? - -1. Install the ggstance package, and create a horizontal boxplot. - How does this compare to using `coord_flip()`? - -1. One problem with boxplots is that they were developed in an era of - much smaller datasets and tend to display a prohibitively large - number of "outlying values". One approach to remedy this problem is - the letter value plot. Install the lvplot package, and try using - `geom_lv()` to display the distribution of price vs cut. What - do you learn? How do you interpret the plots? - -1. Compare and contrast `geom_violin()` with a facetted `geom_histogram()`, - or a coloured `geom_freqpoly()`. What are the pros and cons of each - method? - -1. If you have a small dataset, it's sometimes useful to use `geom_jitter()` - to see the relationship between a continuous and categorical variable. - The ggbeeswarm package provides a number of methods similar to - `geom_jitter()`. List them and briefly describe what each one does. - -### Two categorical variables - -To visualise the covariation between categorical variables, you'll need to count the number of observations for each combination. One way to do that is to rely on the built-in `geom_count()`: - -```{r} -ggplot(data = diamonds) + - geom_count(mapping = aes(x = cut, y = color)) -``` - -The size of each circle in the plot displays how many observations occurred at each combination of values. Covariation will appear as a strong correlation between specific x values and specific y values. - -Another approach is to compute the count with dplyr: - -```{r} -diamonds %>% - count(color, cut) -``` - -Then visualise with `geom_tile()` and the fill aesthetic: - -```{r} -diamonds %>% - count(color, cut) %>% - ggplot(mapping = aes(x = color, y = cut)) + - geom_tile(mapping = aes(fill = n)) -``` - -If the categorical variables are unordered, you might want to use the seriation package to simultaneously reorder the rows and columns in order to more clearly reveal interesting patterns. For larger plots, you might want to try the d3heatmap or heatmaply packages, which create interactive plots. - -#### Exercises - -1. 
How could you rescale the count dataset above to more clearly show - the distribution of cut within colour, or colour within cut? - -1. Use `geom_tile()` together with dplyr to explore how average flight - delays vary by destination and month of year. What makes the - plot difficult to read? How could you improve it? - -1. Why is it slightly better to use `aes(x = color, y = cut)` rather - than `aes(x = cut, y = color)` in the example above? - -### Two continuous variables - -You've already seen one great way to visualise the covariation between two continuous variables: draw a scatterplot with `geom_point()`. You can see covariation as a pattern in the points. For example, you can see an exponential relationship between the carat size and price of a diamond. - -```{r, dev = "png"} -ggplot(data = diamonds) + - geom_point(mapping = aes(x = carat, y = price)) -``` - -Scatterplots become less useful as the size of your dataset grows, because points begin to overplot, and pile up into areas of uniform black (as above). -You've already seen one way to fix the problem: using the `alpha` aesthetic to add transparency. - -```{r, dev = "png"} -ggplot(data = diamonds) + - geom_point(mapping = aes(x = carat, y = price), alpha = 1 / 100) -``` - -But using transparency can be challenging for very large datasets. Another solution is to use bin. Previously you used `geom_histogram()` and `geom_freqpoly()` to bin in one dimension. Now you'll learn how to use `geom_bin2d()` and `geom_hex()` to bin in two dimensions. - -`geom_bin2d()` and `geom_hex()` divide the coordinate plane into 2d bins and then use a fill color to display how many points fall into each bin. `geom_bin2d()` creates rectangular bins. `geom_hex()` creates hexagonal bins. You will need to install the hexbin package to use `geom_hex()`. - -```{r, fig.asp = 1, out.width = "50%", fig.align = "default", message = FALSE} -ggplot(data = smaller) + - geom_bin2d(mapping = aes(x = carat, y = price)) - -# install.packages("hexbin") -ggplot(data = smaller) + - geom_hex(mapping = aes(x = carat, y = price)) -``` - -Another option is to bin one continuous variable so it acts like a categorical variable. Then you can use one of the techniques for visualising the combination of a categorical and a continuous variable that you learned about. For example, you could bin `carat` and then for each group, display a boxplot: - -```{r} -ggplot(data = smaller, mapping = aes(x = carat, y = price)) + - geom_boxplot(mapping = aes(group = cut_width(carat, 0.1))) -``` - -`cut_width(x, width)`, as used above, divides `x` into bins of width `width`. By default, boxplots look roughly the same (apart from number of outliers) regardless of how many observations there are, so it's difficult to tell that each boxplot summarises a different number of points. One way to show that is to make the width of the boxplot proportional to the number of points with `varwidth = TRUE`. - -Another approach is to display approximately the same number of points in each bin. That's the job of `cut_number()`: - -```{r} -ggplot(data = smaller, mapping = aes(x = carat, y = price)) + - geom_boxplot(mapping = aes(group = cut_number(carat, 20))) -``` - -#### Exercises - -1. Instead of summarising the conditional distribution with a boxplot, you - could use a frequency polygon. What do you need to consider when using - `cut_width()` vs `cut_number()`? How does that impact a visualisation of - the 2d distribution of `carat` and `price`? - -1. Visualise the distribution of carat, partitioned by price. 
- -1. How does the price distribution of very large diamonds compare to small - diamonds? Is it as you expect, or does it surprise you? - -1. Combine two of the techniques you've learned to visualise the - combined distribution of cut, carat, and price. - -1. Two dimensional plots reveal outliers that are not visible in one - dimensional plots. For example, some points in the plot below have an - unusual combination of `x` and `y` values, which makes the points outliers - even though their `x` and `y` values appear normal when examined separately. - - ```{r, dev = "png"} - ggplot(data = diamonds) + - geom_point(mapping = aes(x = x, y = y)) + - coord_cartesian(xlim = c(4, 11), ylim = c(4, 11)) - ``` - - Why is a scatterplot a better display than a binned plot for this case? - -## Patterns and models - -Patterns in your data provide clues about relationships. If a systematic relationship exists between two variables it will appear as a pattern in the data. If you spot a pattern, ask yourself: - -+ Could this pattern be due to coincidence (i.e. random chance)? - -+ How can you describe the relationship implied by the pattern? - -+ How strong is the relationship implied by the pattern? - -+ What other variables might affect the relationship? - -+ Does the relationship change if you look at individual subgroups of the data? - -A scatterplot of Old Faithful eruption lengths versus the wait time between eruptions shows a pattern: longer wait times are associated with longer eruptions. The scatterplot also displays the two clusters that we noticed above. - -```{r fig.height = 2} -ggplot(data = faithful) + - geom_point(mapping = aes(x = eruptions, y = waiting)) -``` - -Patterns provide one of the most useful tools for data scientists because they reveal covariation. If you think of variation as a phenomenon that creates uncertainty, covariation is a phenomenon that reduces it. If two variables covary, you can use the values of one variable to make better predictions about the values of the second. If the covariation is due to a causal relationship (a special case), then you can use the value of one variable to control the value of the second. - -Models are a tool for extracting patterns out of data. For example, consider the diamonds data. It's hard to understand the relationship between cut and price, because cut and carat, and carat and price are tightly related. It's possible to use a model to remove the very strong relationship between price and carat so we can explore the subtleties that remain. The following code fits a model that predicts `price` from `carat` and then computes the residuals (the difference between the predicted value and the actual value). The residuals give us a view of the price of the diamond, once the effect of carat has been removed. - -```{r, dev = "png"} -library(modelr) - -mod <- lm(log(price) ~ log(carat), data = diamonds) - -diamonds2 <- diamonds %>% - add_residuals(mod) %>% - mutate(resid = exp(resid)) - -ggplot(data = diamonds2) + - geom_point(mapping = aes(x = carat, y = resid)) -``` - -Once you've removed the strong relationship between carat and price, you can see what you expect in the relationship between cut and price: relative to their size, better quality diamonds are more expensive. - -```{r} -ggplot(data = diamonds2) + - geom_boxplot(mapping = aes(x = cut, y = resid)) -``` - -You'll learn how models, and the modelr package, work in the final part of the book, [model](#model-intro). 
We're saving modelling for later because understanding what models are and how they work is easiest once you have tools of data wrangling and programming in hand. - -## ggplot2 calls - -As we move on from these introductory chapters, we'll transition to a more concise expression of ggplot2 code. So far we've been very explicit, which is helpful when you are learning: - -```{r, eval = FALSE} -ggplot(data = faithful, mapping = aes(x = eruptions)) + - geom_freqpoly(binwidth = 0.25) -``` - -Typically, the first one or two arguments to a function are so important that you should know them by heart. The first two arguments to `ggplot()` are `data` and `mapping`, and the first two arguments to `aes()` are `x` and `y`. In the remainder of the book, we won't supply those names. That saves typing, and, by reducing the amount of boilerplate, makes it easier to see what's different between plots. That's a really important programming concern that we'll come back in [functions]. - -Rewriting the previous plot more concisely yields: - -```{r, eval = FALSE} -ggplot(faithful, aes(eruptions)) + - geom_freqpoly(binwidth = 0.25) -``` - -Sometimes we'll turn the end of a pipeline of data transformation into a plot. Watch for the transition from `%>%` to `+`. I wish this transition wasn't necessary but unfortunately ggplot2 was created before the pipe was discovered. - -```{r, eval = FALSE} -diamonds %>% - count(cut, clarity) %>% - ggplot(aes(clarity, cut, fill = n)) + - geom_tile() -``` - -## Learning more - -If you want to learn more about the mechanics of ggplot2, I'd highly recommend grabbing a copy of the ggplot2 book: . It's been recently updated, so it includes dplyr and tidyr code, and has much more space to explore all the facets of visualisation. Unfortunately the book isn't generally available for free, but if you have a connection to a university you can probably get an electronic version for free through SpringerLink. - -Another useful resource is the [_R Graphics Cookbook_](https://amzn.com/1449316956) by Winston Chang. Much of the contents are available online at . - -I also recommend [_Graphical Data Analysis with R_](https://amzn.com/1498715230), by Antony Unwin. This is a book-length treatment similar to the material covered in this chapter, but has the space to go into much greater depth. diff --git a/EDA.qmd b/EDA.qmd new file mode 100644 index 000000000..720d27120 --- /dev/null +++ b/EDA.qmd @@ -0,0 +1,715 @@ +# Exploratory data analysis {#sec-exploratory-data-analysis} + +```{r} +#| echo: false + +source("_common.R") +``` + +## Introduction + +This chapter will show you how to use visualization and transformation to explore your data in a systematic way, a task that statisticians call exploratory data analysis, or EDA for short. +EDA is an iterative cycle. +You: + +1. Generate questions about your data. + +2. Search for answers by visualizing, transforming, and modelling your data. + +3. Use what you learn to refine your questions and/or generate new questions. + +EDA is not a formal process with a strict set of rules. +More than anything, EDA is a state of mind. +During the initial phases of EDA you should feel free to investigate every idea that occurs to you. +Some of these ideas will pan out, and some will be dead ends. +As your exploration continues, you will home in on a few particularly productive insights that you'll eventually write up and communicate to others. 
+ +EDA is an important part of any data analysis, even if the primary research questions are handed to you on a platter, because you always need to investigate the quality of your data. +Data cleaning is just one application of EDA: you ask questions about whether your data meets your expectations or not. +To do data cleaning, you'll need to deploy all the tools of EDA: visualization, transformation, and modelling. + +### Prerequisites + +In this chapter we'll combine what you've learned about dplyr and ggplot2 to interactively ask questions, answer them with data, and then ask new questions. + +```{r} +#| label: setup +#| message: false + +library(tidyverse) +``` + +## Questions + +> "There are no routine statistical questions, only questionable statistical routines." --- Sir David Cox + +> "Far better an approximate answer to the right question, which is often vague, than an exact answer to the wrong question, which can always be made precise." --- John Tukey + +Your goal during EDA is to develop an understanding of your data. +The easiest way to do this is to use questions as tools to guide your investigation. +When you ask a question, the question focuses your attention on a specific part of your dataset and helps you decide which graphs, models, or transformations to make. + +EDA is fundamentally a creative process. +And like most creative processes, the key to asking *quality* questions is to generate a large *quantity* of questions. +It is difficult to ask revealing questions at the start of your analysis because you do not know what insights can be gleaned from your dataset. +On the other hand, each new question that you ask will expose you to a new aspect of your data and increase your chance of making a discovery. +You can quickly drill down into the most interesting parts of your data---and develop a set of thought-provoking questions---if you follow up each question with a new question based on what you find. + +There is no rule about which questions you should ask to guide your research. +However, two types of questions will always be useful for making discoveries within your data. +You can loosely word these questions as: + +1. What type of variation occurs within my variables? + +2. What type of covariation occurs between my variables? + +The rest of this chapter will look at these two questions. +We'll explain what variation and covariation are, and we'll show you several ways to answer each question. + +## Variation + +**Variation** is the tendency of the values of a variable to change from measurement to measurement. +You can see variation easily in real life; if you measure any continuous variable twice, you will get two different results. +This is true even if you measure quantities that are constant, like the speed of light. +Each of your measurements will include a small amount of error that varies from measurement to measurement. +Variables can also vary if you measure across different subjects (e.g., the eye colors of different people) or at different times (e.g., the energy levels of an electron at different moments). +Every variable has its own pattern of variation, which can reveal interesting information about how it varies between measurements on the same observation as well as across observations. +The best way to understand that pattern is to visualize the distribution of the variable's values, which you've learned about in @sec-data-visualization. 
+ +We'll start our exploration by visualizing the distribution of weights (`carat`) of \~54,000 diamonds from the `diamonds` dataset. +Since `carat` is a numerical variable, we can use a histogram: + +```{r} +#| fig-alt: | +#| A histogram of carats of diamonds, with the x-axis ranging from 0 to 4.5 +#| and the y-axis ranging from 0 to 30000. The distribution is right skewed +#| with very few diamonds in the bin centered at 0, almost 30000 diamonds in +#| the bin centered at 0.5, approximately 15000 diamonds in the bin centered +#| at 1, and much fewer, approximately 5000 diamonds in the bin centered at +#| 1.5. Beyond this, there's a trailing tail. + +ggplot(diamonds, aes(x = carat)) + + geom_histogram(binwidth = 0.5) +``` + +Now that you can visualize variation, what should you look for in your plots? +And what type of follow-up questions should you ask? +We've put together a list below of the most useful types of information that you will find in your graphs, along with some follow-up questions for each type of information. +The key to asking good follow-up questions will be to rely on your curiosity (What do you want to learn more about?) as well as your skepticism (How could this be misleading?). + +### Typical values + +In both bar charts and histograms, tall bars show the common values of a variable, and shorter bars show less-common values. +Places that do not have bars reveal values that were not seen in your data. +To turn this information into useful questions, look for anything unexpected: + +- Which values are the most common? + Why? + +- Which values are rare? + Why? + Does that match your expectations? + +- Can you see any unusual patterns? + What might explain them? + +Let's take a look at the distribution of `carat` for smaller diamonds. + +```{r} +#| fig-alt: | +#| A histogram of carats of diamonds, with the x-axis ranging from 0 to 3 and +#| the y-axis ranging from 0 to roughly 2500. The binwidth is quite narrow +#| (0.01), resulting in a very large number of skinny bars. The distribution +#| is right skewed, with many peaks followed by bars in decreasing heights, +#| until a sharp increase at the next peak. + +smaller <- diamonds |> + filter(carat < 3) + +ggplot(smaller, aes(x = carat)) + + geom_histogram(binwidth = 0.01) +``` + +This histogram suggests several interesting questions: + +- Why are there more diamonds at whole carats and common fractions of carats? + +- Why are there more diamonds slightly to the right of each peak than there are slightly to the left of each peak? + +Visualizations can also reveal clusters, which suggest that subgroups exist in your data. +To understand the subgroups, ask: + +- How are the observations within each subgroup similar to each other? + +- How are the observations in separate clusters different from each other? + +- How can you explain or describe the clusters? + +- Why might the appearance of clusters be misleading? + +Some of these questions can be answered with the data while some will require domain expertise about the data. +Many of them will prompt you to explore a relationship *between* variables, for example, to see if the values of one variable can explain the behavior of another variable. +We'll get to that shortly. + +### Unusual values + +Outliers are observations that are unusual; data points that don't seem to fit the pattern. 
+Sometimes outliers are data entry errors, sometimes they are simply values at the extremes that happened to be observed in this data collection, and other times they suggest important new discoveries. +When you have a lot of data, outliers are sometimes difficult to see in a histogram. +For example, take the distribution of the `y` variable from the diamonds dataset. +The only evidence of outliers is the unusually wide limits on the x-axis. + +```{r} +#| fig-alt: | +#| A histogram of lengths of diamonds. The x-axis ranges from 0 to 60 and +#| the y-axis ranges from 0 to 12000. There is a peak around 5, and the +#| data appear to be completely clustered around the peak. + +ggplot(diamonds, aes(x = y)) + + geom_histogram(binwidth = 0.5) +``` + +There are so many observations in the common bins that the rare bins are very short, making it very difficult to see them (although maybe if you stare intently at 0 you'll spot something). +To make it easy to see the unusual values, we need to zoom to small values of the y-axis with `coord_cartesian()`: + +```{r} +#| fig-alt: | +#| A histogram of lengths of diamonds. The x-axis ranges from 0 to 60 and the +#| y-axis ranges from 0 to 50. There is a peak around 5, and the data +#| appear to be completely clustered around the peak. Other than those data, +#| there is one bin at 0 with a height of about 8, one a little over 30 with +#| a height of 1 and another one a little below 60 with a height of 1. + +ggplot(diamonds, aes(x = y)) + + geom_histogram(binwidth = 0.5) + + coord_cartesian(ylim = c(0, 50)) +``` + +`coord_cartesian()` also has an `xlim()` argument for when you need to zoom into the x-axis. +ggplot2 also has `xlim()` and `ylim()` functions that work slightly differently: they throw away the data outside the limits. + +This allows us to see that there are three unusual values: 0, \~30, and \~60. +We pluck them out with dplyr: + +```{r} +#| include: false + +old <- options(tibble.print_max = 10, tibble.print_min = 10) +``` + +```{r} +unusual <- diamonds |> + filter(y < 3 | y > 20) |> + select(price, x, y, z) |> + arrange(y) +unusual +``` + +```{r} +#| include: false + +options(old) +``` + +The `y` variable measures one of the three dimensions of these diamonds, in mm. +We know that diamonds can't have a width of 0mm, so these values must be incorrect. +By doing EDA, we have discovered missing data that was coded as 0, which we never would have found by simply searching for `NA`s. +Going forward we might choose to re-code these values as `NA`s in order to prevent misleading calculations. +We might also suspect that measurements of 32mm and 59mm are implausible: those diamonds are over an inch long, but don't cost hundreds of thousands of dollars! + +It's good practice to repeat your analysis with and without the outliers. +If they have minimal effect on the results, and you can't figure out why they're there, it's reasonable to omit them, and move on. +However, if they have a substantial effect on your results, you shouldn't drop them without justification. +You'll need to figure out what caused them (e.g., a data entry error) and disclose that you removed them in your write-up. + +### Exercises + +1. Explore the distribution of each of the `x`, `y`, and `z` variables in `diamonds`. + What do you learn? + Think about a diamond and how you might decide which dimension is the length, width, and depth. + +2. Explore the distribution of `price`. + Do you discover anything unusual or surprising? 
+ (Hint: Carefully think about the `binwidth` and make sure you try a wide range of values.) + +3. How many diamonds are 0.99 carat? + How many are 1 carat? + What do you think is the cause of the difference? + +4. Compare and contrast `coord_cartesian()` vs. `xlim()` or `ylim()` when zooming in on a histogram. + What happens if you leave `binwidth` unset? + What happens if you try and zoom so only half a bar shows? + +## Unusual values {#sec-unusual-values-eda} + +If you've encountered unusual values in your dataset, and simply want to move on to the rest of your analysis, you have two options. + +1. Drop the entire row with the strange values: + + ```{r} + #| eval: false + + diamonds2 <- diamonds |> + filter(between(y, 3, 20)) + ``` + + We don't recommend this option because one invalid value doesn't imply that all the other values for that observation are also invalid. + Additionally, if you have low quality data, by the time that you've applied this approach to every variable you might find that you don't have any data left! + +2. Instead, we recommend replacing the unusual values with missing values. + The easiest way to do this is to use `mutate()` to replace the variable with a modified copy. + You can use the `if_else()` function to replace unusual values with `NA`: + + ```{r} + diamonds2 <- diamonds |> + mutate(y = if_else(y < 3 | y > 20, NA, y)) + ``` + +It's not obvious where you should plot missing values, so ggplot2 doesn't include them in the plot, but it does warn that they've been removed: + +```{r} +#| dev: "png" +#| fig-alt: | +#| A scatterplot of widths vs. lengths of diamonds. There is a strong, +#| linear association between the two variables. All but one of the diamonds +#| has length greater than 3. The one outlier has a length of 0 and a width +#| of about 6.5. + +ggplot(diamonds2, aes(x = x, y = y)) + + geom_point() +``` + +To suppress that warning, set `na.rm = TRUE`: + +```{r} +#| eval: false + +ggplot(diamonds2, aes(x = x, y = y)) + + geom_point(na.rm = TRUE) +``` + +Other times you want to understand what makes observations with missing values different to observations with recorded values. +For example, in `nycflights13::flights`[^eda-1], missing values in the `dep_time` variable indicate that the flight was cancelled. +So you might want to compare the scheduled departure times for cancelled and non-cancelled flights. +You can do this by making a new variable, using `is.na()` to check if `dep_time` is missing. + +[^eda-1]: Remember that when we need to be explicit about where a function (or dataset) comes from, we'll use the special form `package::function()` or `package::dataset`. + +```{r} +#| fig-alt: | +#| A frequency polygon of scheduled departure times of flights. Two lines +#| represent flights that are cancelled and not cancelled. The x-axis ranges +#| from 0 to 25 hours and the y-axis ranges from 0 to 10000. The number of +#| non-cancelled flights is much higher than the number of cancelled flights. + +nycflights13::flights |> + mutate( + cancelled = is.na(dep_time), + sched_hour = sched_dep_time %/% 100, + sched_min = sched_dep_time %% 100, + sched_dep_time = sched_hour + (sched_min / 60) + ) |> + ggplot(aes(x = sched_dep_time)) + + geom_freqpoly(aes(color = cancelled), binwidth = 1/4) +```
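As an aside, the `%/%` (integer division) and `%%` (remainder) operators used above turn a time stored as a number like `1530` (meaning 15:30) into a decimal hour. A minimal sketch of the arithmetic, shown for a single value:

```{r}
#| eval: false

# 1530 represents a scheduled departure at 15:30
1530 %/% 100 # 15, the hour
1530 %% 100  # 30, the minutes
15 + 30 / 60 # 15.5, the decimal hour plotted on the x-axis
```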
+    What happens to missing values in a bar chart?
+    Why is there a difference in how missing values are handled in histograms and bar charts?
+
+2.  What does `na.rm = TRUE` do in `mean()` and `sum()`?
+
+3.  Recreate the frequency plot of `sched_dep_time` colored by whether the flight was cancelled or not.
+    Also facet by the `cancelled` variable.
+    Experiment with different values of the `scales` argument in the faceting function to mitigate the effect of more non-cancelled flights than cancelled flights.
+
+## Covariation
+
+If variation describes the behavior *within* a variable, covariation describes the behavior *between* variables.
+**Covariation** is the tendency for the values of two or more variables to vary together in a related way.
+The best way to spot covariation is to visualize the relationship between two or more variables.
+
+### A categorical and a numerical variable {#sec-cat-num}
+
+For example, let's explore how the price of a diamond varies with its quality (measured by `cut`) using `geom_freqpoly()`:
+
+```{r}
+#| fig-alt: |
+#|   A frequency polygon of prices of diamonds where each cut (Fair, 
+#|   Good, Very Good, Premium, and Ideal) is represented with a different color 
+#|   line. The x-axis ranges from 0 to 30000 and the y-axis ranges from 0 to 
+#|   5000. The lines overlap a great deal, suggesting similar frequency 
+#|   distributions of prices of diamonds. One notable feature is that 
+#|   Ideal diamonds have the highest peak around 1500.
+
+ggplot(diamonds, aes(x = price)) + 
+  geom_freqpoly(aes(color = cut), binwidth = 500, linewidth = 0.75)
+```
+
+Note that ggplot2 uses an ordered color scale for `cut` because it's defined as an ordered factor variable in the data.
+You'll learn more about these in @sec-ordered-factors.
+
+The default appearance of `geom_freqpoly()` is not that useful here because the height, determined by the overall count, differs so much across `cut`s, making it hard to see the differences in the shapes of their distributions.
+
+To make the comparison easier we need to swap what is displayed on the y-axis.
+Instead of displaying count, we'll display the **density**, which is the count standardized so that the area under each frequency polygon is one.
+
+```{r}
+#| fig-alt: |
+#|   A frequency polygon of densities of prices of diamonds where each cut 
+#|   (Fair, Good, Very Good, Premium, and Ideal) is represented with a 
+#|   different color line. The x-axis ranges from 0 to 20000. The lines overlap 
+#|   a great deal, suggesting similar density distributions of prices of 
+#|   diamonds. One notable feature is that all but Fair diamonds have high peaks 
+#|   around a price of 1500 and Fair diamonds have a higher mean than others.
+
+ggplot(diamonds, aes(x = price, y = after_stat(density))) + 
+  geom_freqpoly(aes(color = cut), binwidth = 500, linewidth = 0.75)
+```
+
+Note that we're mapping the density to the `y` aesthetic, but since `density` is not a variable in the `diamonds` dataset, we need to first calculate it.
+We use the `after_stat()` function to do so.
+
+There's something rather surprising about this plot - it appears that fair diamonds (the lowest quality) have the highest average price!
+But maybe that's because frequency polygons are a little hard to interpret - there's a lot going on in this plot.
+
+A visually simpler way to explore this relationship is with side-by-side boxplots.
+
+```{r}
+#| fig-alt: |
+#|   Side-by-side boxplots of prices of diamonds by cut.
The distribution of +#| prices is right skewed for each cut (Fair, Good, Very Good, Premium, and +#| Ideal). The medians are close to each other, with the median for Ideal +#| diamonds lowest and that for Fair highest. + +ggplot(diamonds, aes(x = cut, y = price)) + + geom_boxplot() +``` + +We see much less information about the distribution, but the boxplots are much more compact so we can more easily compare them (and fit more on one plot). +It supports the counter-intuitive finding that better quality diamonds are typically cheaper! +In the exercises, you'll be challenged to figure out why. + +`cut` is an ordered factor: fair is worse than good, which is worse than very good and so on. +Many categorical variables don't have such an intrinsic order, so you might want to reorder them to make a more informative display. +One way to do that is with `fct_reorder()`. +You'll learn more about that function in @sec-modifying-factor-order, but we want to give you a quick preview here because it's so useful. +For example, take the `class` variable in the `mpg` dataset. +You might be interested to know how highway mileage varies across classes: + +```{r} +#| fig-alt: | +#| Side-by-side boxplots of highway mileages of cars by class. Classes are +#| on the x-axis (2seaters, compact, midsize, minivan, pickup, subcompact, +#| and suv). + +ggplot(mpg, aes(x = class, y = hwy)) + + geom_boxplot() +``` + +To make the trend easier to see, we can reorder `class` based on the median value of `hwy`: + +```{r} +#| fig-alt: | +#| Side-by-side boxplots of highway mileages of cars by class. Classes are +#| on the x-axis and ordered by increasing median highway mileage (pickup, +#| suv, minivan, 2seater, subcompact, compact, and midsize). + +ggplot(mpg, aes(x = fct_reorder(class, hwy, median), y = hwy)) + + geom_boxplot() +``` + +If you have long variable names, `geom_boxplot()` will work better if you flip it 90°. +You can do that by exchanging the x and y aesthetic mappings. + +```{r} +#| fig-alt: | +#| Side-by-side boxplots of highway mileages of cars by class. Classes are +#| on the y-axis and ordered by increasing median highway mileage. + +ggplot(mpg, aes(x = hwy, y = fct_reorder(class, hwy, median))) + + geom_boxplot() +``` + +#### Exercises + +1. Use what you've learned to improve the visualization of the departure times of cancelled vs. non-cancelled flights. + +2. Based on EDA, what variable in the diamonds dataset appears to be most important for predicting the price of a diamond? + How is that variable correlated with cut? + Why does the combination of those two relationships lead to lower quality diamonds being more expensive? + +3. Instead of exchanging the x and y variables, add `coord_flip()` as a new layer to the vertical boxplot to create a horizontal one. + How does this compare to exchanging the variables? + +4. One problem with boxplots is that they were developed in an era of much smaller datasets and tend to display a prohibitively large number of "outlying values". + One approach to remedy this problem is the letter value plot. + Install the lvplot package, and try using `geom_lv()` to display the distribution of price vs. cut. + What do you learn? + How do you interpret the plots? + +5. Create a visualization of diamond prices vs. a categorical variable from the `diamonds` dataset using `geom_violin()`, then a faceted `geom_histogram()`, then a colored `geom_freqpoly()`, and then a colored `geom_density()`. + Compare and contrast the four plots. 
+    What are the pros and cons of each method of visualizing the distribution of a numerical variable based on the levels of a categorical variable?
+
+6.  If you have a small dataset, it's sometimes useful to use `geom_jitter()` to avoid overplotting and more easily see the relationship between a continuous and categorical variable.
+    The ggbeeswarm package provides a number of methods similar to `geom_jitter()`.
+    List them and briefly describe what each one does.
+
+### Two categorical variables
+
+To visualize the covariation between categorical variables, you'll need to count the number of observations for each combination of levels of these categorical variables.
+One way to do that is to rely on the built-in `geom_count()`:
+
+```{r}
+#| fig-alt: |
+#|   A scatterplot of color vs. cut of diamonds. There is one point for each
+#|   combination of levels of cut (Fair, Good, Very Good, Premium, and Ideal) 
+#|   and color (D, E, F, G, H, I, and J). The sizes of the points represent 
+#|   the number of observations for that combination. The legend indicates 
+#|   that these sizes range between 1000 and 4000.
+
+ggplot(diamonds, aes(x = cut, y = color)) +
+  geom_count()
+```
+
+The size of each circle in the plot displays how many observations occurred at each combination of values.
+Covariation will appear as a strong correlation between specific x values and specific y values.
+
+Another approach for exploring the relationship between these variables is computing the counts with dplyr:
+
+```{r}
+diamonds |> 
+  count(color, cut)
+```
+
+Then visualize with `geom_tile()` and the fill aesthetic:
+
+```{r}
+#| fig-alt: |
+#|   A tile plot of cut vs. color of diamonds. Each tile represents a 
+#|   cut/color combination and tiles are colored according to the number of 
+#|   observations in each tile. There are more Ideal diamonds than other cuts, 
+#|   with the highest number being Ideal diamonds with color G. Fair diamonds 
+#|   and diamonds with color I are the lowest in frequency.
+
+diamonds |> 
+  count(color, cut) |> 
+  ggplot(aes(x = color, y = cut)) +
+  geom_tile(aes(fill = n))
+```
+
+If the categorical variables are unordered, you might want to use the seriation package to simultaneously reorder the rows and columns in order to more clearly reveal interesting patterns.
+For larger plots, you might want to try the heatmaply package, which creates interactive plots.
+
+#### Exercises
+
+1.  How could you rescale the count dataset above to more clearly show the distribution of cut within color, or color within cut?
+
+2.  What different data insights do you get with a segmented bar chart if color is mapped to the `x` aesthetic and `cut` is mapped to the `fill` aesthetic?
+    Calculate the counts that fall into each of the segments.
+
+3.  Use `geom_tile()` together with dplyr to explore how average flight departure delays vary by destination and month of year.
+    What makes the plot difficult to read?
+    How could you improve it?
+
+### Two numerical variables
+
+You've already seen one great way to visualize the covariation between two numerical variables: draw a scatterplot with `geom_point()`.
+You can see covariation as a pattern in the points.
+For example, you can see a positive relationship between the carat size and price of a diamond: diamonds with more carats have a higher price.
+The relationship is exponential.
+
+```{r}
+#| dev: "png"
+#| fig-alt: |
+#|   A scatterplot of price vs. carat. The relationship is positive, somewhat 
+#|   strong, and exponential.
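+# `smaller` isn't created in this excerpt; it's assumed to be defined
+# earlier in the chapter by dropping the few diamonds of 3 carats or more,
+# along the lines of: smaller <- diamonds |> filter(carat < 3)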
+
+ggplot(smaller, aes(x = carat, y = price)) + 
+  geom_point()
+```
+
+(In this section we'll use the `smaller` dataset to stay focused on the bulk of the diamonds that are smaller than 3 carats.)
+
+Scatterplots become less useful as the size of your dataset grows, because points begin to overplot, and pile up into areas of uniform black, making it hard to judge differences in the density of the data across the 2-dimensional space as well as making it hard to spot the trend.
+You've already seen one way to fix the problem: using the `alpha` aesthetic to add transparency.
+
+```{r}
+#| dev: "png"
+#| fig-alt: |
+#|   A scatterplot of price vs. carat. The relationship is positive, somewhat 
+#|   strong, and exponential. The points are transparent, showing clusters where 
+#|   the number of points is higher than other areas. The most obvious clusters 
+#|   are for diamonds with 1, 1.5, and 2 carats.
+
+ggplot(smaller, aes(x = carat, y = price)) + 
+  geom_point(alpha = 1 / 100)
+```
+
+But using transparency can be challenging for very large datasets.
+Another solution is to use binning.
+Previously you used `geom_histogram()` and `geom_freqpoly()` to bin in one dimension.
+Now you'll learn how to use `geom_bin2d()` and `geom_hex()` to bin in two dimensions.
+
+`geom_bin2d()` and `geom_hex()` divide the coordinate plane into 2d bins and then use a fill color to display how many points fall into each bin.
+`geom_bin2d()` creates rectangular bins.
+`geom_hex()` creates hexagonal bins.
+You will need to install the hexbin package to use `geom_hex()`.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 3
+#| fig-alt: |
+#|   Plot 1: A binned density plot of price vs. carat. Plot 2: A hexagonal bin 
+#|   plot of price vs. carat. Both plots show that the highest density of 
+#|   diamonds have low carats and low prices.
+
+ggplot(smaller, aes(x = carat, y = price)) +
+  geom_bin2d()
+
+# install.packages("hexbin")
+ggplot(smaller, aes(x = carat, y = price)) +
+  geom_hex()
+```
+
+Another option is to bin one continuous variable so it acts like a categorical variable.
+Then you can use one of the techniques for visualizing the combination of a categorical and a continuous variable that you learned about.
+For example, you could bin `carat` and then for each group, display a boxplot:
+
+```{r}
+#| fig-alt: |
+#|   Side-by-side box plots of price by carat. Each box plot represents diamonds 
+#|   that are 0.1 carats apart in weight. The box plots show that as carat 
+#|   increases the median price increases as well. Additionally, diamonds with 
+#|   1.5 carats or lower have right skewed price distributions, 1.5 to 2 have 
+#|   roughly symmetric price distributions, and diamonds that weigh more have 
+#|   left skewed distributions. Cheaper, smaller diamonds have outliers on the 
+#|   higher end, more expensive, bigger diamonds have outliers on the lower end.
+
+ggplot(smaller, aes(x = carat, y = price)) + 
+  geom_boxplot(aes(group = cut_width(carat, 0.1)))
+```
+
+`cut_width(x, width)`, as used above, divides `x` into bins of width `width`.
+By default, boxplots look roughly the same (apart from the number of outliers) regardless of how many observations there are, so it's difficult to tell that each boxplot summarizes a different number of points.
+One way to show that is to make the width of the boxplot proportional to the number of points with `varwidth = TRUE`.
+
+#### Exercises
+
+1.  Instead of summarizing the conditional distribution with a boxplot, you could use a frequency polygon.
+    What do you need to consider when using `cut_width()` vs. `cut_number()`?
+    How does that impact a visualization of the 2d distribution of `carat` and `price`?
+
+2.  Visualize the distribution of `carat`, partitioned by `price`.
+
+3.  How does the price distribution of very large diamonds compare to small diamonds?
+    Is it as you expect, or does it surprise you?
+
+4.  Combine two of the techniques you've learned to visualize the combined distribution of cut, carat, and price.
+
+5.  Two-dimensional plots reveal outliers that are not visible in one-dimensional plots.
+    For example, some points in the following plot have an unusual combination of `x` and `y` values, which makes the points outliers even though their `x` and `y` values appear normal when examined separately.
+    Why is a scatterplot a better display than a binned plot for this case?
+
+    ```{r}
+    #| eval: false
+    diamonds |> 
+      filter(x >= 4) |> 
+      ggplot(aes(x = x, y = y)) +
+      geom_point() +
+      coord_cartesian(xlim = c(4, 11), ylim = c(4, 11))
+    ```
+
+6.  Instead of creating boxes of equal width with `cut_width()`, we could create boxes that contain a roughly equal number of points with `cut_number()`.
+    What are the advantages and disadvantages of this approach?
+
+    ```{r}
+    #| eval: false
+    ggplot(smaller, aes(x = carat, y = price)) + 
+      geom_boxplot(aes(group = cut_number(carat, 20)))
+    ```
+
+## Patterns and models
+
+If a systematic relationship exists between two variables it will appear as a pattern in the data.
+If you spot a pattern, ask yourself:
+
+-   Could this pattern be due to coincidence (i.e. random chance)?
+
+-   How can you describe the relationship implied by the pattern?
+
+-   How strong is the relationship implied by the pattern?
+
+-   What other variables might affect the relationship?
+
+-   Does the relationship change if you look at individual subgroups of the data?
+
+Patterns in your data provide clues about relationships, i.e., they reveal covariation.
+If you think of variation as a phenomenon that creates uncertainty, covariation is a phenomenon that reduces it.
+If two variables covary, you can use the values of one variable to make better predictions about the values of the second.
+If the covariation is due to a causal relationship (a special case), then you can use the value of one variable to control the value of the second.
+
+Models are a tool for extracting patterns out of data.
+For example, consider the diamonds data.
+It's hard to understand the relationship between cut and price, because cut and carat, and carat and price are tightly related.
+It's possible to use a model to remove the very strong relationship between price and carat so we can explore the subtleties that remain.
+The following code fits a model that predicts `price` from `carat` and then computes the residuals (the difference between the predicted value and the actual value).
+The residuals give us a view of the price of the diamond, once the effect of carat has been removed.
+Note that instead of using the raw values of `price` and `carat`, we log transform them first, and fit a model to the log-transformed values.
+Then, we exponentiate the residuals to put them back in the scale of raw prices.
+
+```{r}
+#| message: false
+#| dev: "png"
+#| fig-alt: |
+#|   A scatterplot of residuals vs. carat of diamonds. The x-axis ranges from 0 
+#|   to 5, the y-axis ranges from 0 to almost 4. Much of the data are clustered 
+#|   around low values of carat and residuals. There is a clear, curved pattern 
+#|   showing a decrease in residuals as carat increases.
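+# The model below uses tidymodels, which hasn't been loaded yet in this
+# chapter; if you don't already have it, install it first with
+# install.packages("tidymodels")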
+
+library(tidymodels)
+
+diamonds <- diamonds |>
+  mutate(
+    log_price = log(price),
+    log_carat = log(carat)
+  )
+
+diamonds_fit <- linear_reg() |>
+  fit(log_price ~ log_carat, data = diamonds)
+
+diamonds_aug <- augment(diamonds_fit, new_data = diamonds) |>
+  mutate(.resid = exp(.resid))
+
+ggplot(diamonds_aug, aes(x = carat, y = .resid)) + 
+  geom_point()
+```
+
+Once you've removed the strong relationship between carat and price, you can see what you expect in the relationship between cut and price: relative to their size, better quality diamonds are more expensive.
+
+```{r}
+#| fig-alt: |
+#|   Side-by-side box plots of residuals by cut. The x-axis displays the various 
+#|   cuts (Fair to Ideal), the y-axis ranges from 0 to almost 5. The medians are 
+#|   quite similar, between roughly 0.75 and 1.25. Each of the distributions of 
+#|   residuals is right skewed, with many outliers on the higher end.
+
+ggplot(diamonds_aug, aes(x = cut, y = .resid)) + 
+  geom_boxplot()
+```
+
+We're not discussing modeling in this book because understanding what models are and how they work is easiest once you have tools of data wrangling and programming in hand.
+
+## Summary
+
+In this chapter you've learned a variety of tools to help you understand the variation within your data.
+You've seen techniques that work with a single variable at a time and with a pair of variables.
+This might seem painfully restrictive if you have tens or hundreds of variables in your data, but they're the foundation upon which all other techniques are built.
+
+In the next chapter, we'll focus on the tools we can use to communicate our results.
diff --git a/README.md b/README.md
index 7ec718ef5..b9ce43b97 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,66 @@
 # R for Data Science
 
-This is code and text behind the [R for Data Science](http://r4ds.had.co.nz)
-book.
+
-The R packages used in this book can be installed via
+[![Render and deploy Book to Netlify](https://github.com/hadley/r4ds/actions/workflows/build_book.yaml/badge.svg)](https://github.com/hadley/r4ds/actions/workflows/build_book.yaml)
+
+
+
+This repository contains the source of the [R for Data Science](http://r4ds.hadley.nz) book.
+The book is built using [Quarto](https://quarto.org/).
+
+## Images
+
+### Omnigraffle drawings
+
+-   Font: 12pt Guardian Sans Condensed / Ubuntu mono
+
+-   Export as 300 dpi png.
+
+-   Website font is 18 px = 13.5 pt, so scale dpi to match font sizes: 270 = 300 \* 12 / 13.5.
+    (I also verified this empirically by screenshotting.)
+
+    ``` r
+    #| echo: FALSE
+    #| out.width: NULL
+    knitr::include_graphics("diagrams/transform.png", dpi = 270)
+    ```
+
+### Screenshots
+
+-   Make sure you're using a light theme.
+    For small interface elements (e.g., toolbars), zoom in twice.
+
+-   Screenshot with Cmd + Shift + 4.
+
+-   Don't need to set dpi:
+
+    ``` r
+    #| echo: FALSE
+    #| out.width: NULL
+    knitr::include_graphics("screenshots/rstudio-wg.png")
+    ```
+
+### O'Reilly
+
+To generate the book for O'Reilly, build the book, then:
 
 ```{r}
-devtools::install_github("hadley/r4ds")
+# pak::pak("hadley/htmlbook")
+htmlbook::convert_book()
+
+html <- list.files("oreilly", pattern = "[.]html$", full.names = TRUE)
+file.copy(html, "../r-for-data-science-2e/", overwrite = TRUE)
+
+pngs <- list.files("oreilly", pattern = "[.]png$", full.names = TRUE, recursive = TRUE)
+dest <- gsub("oreilly", "../r-for-data-science-2e/", pngs)
+fs::dir_create(unique(dirname(dest)))
+file.copy(pngs, dest, overwrite = TRUE)
 ```
 
-The site is built using [bookdown package](https://github.com/rstudio/bookdown).
-To create the site, you also need:
-* [pandoc](http://johnmacfarlane.net/pandoc/)
+Then commit and push to Atlas.
+
+## Code of Conduct
+
+Please note that r4ds uses a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/0/CODE_OF_CONDUCT.html).
+By contributing to this book, you agree to abide by its terms.
diff --git a/_bookdown.yml b/_bookdown.yml
deleted file mode 100644
index 679c1e9ed..000000000
--- a/_bookdown.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-new_session: yes
-
-rmd_files: [
-  "index.rmd",
-  "intro.Rmd",
-
-  "explore.Rmd",
-  "visualize.Rmd",
-  "workflow-basics.Rmd",
-  "transform.Rmd",
-  "workflow-scripts.Rmd",
-  "EDA.Rmd",
-  "workflow-projects.Rmd",
-
-  "wrangle.Rmd",
-  "tibble.Rmd",
-  "import.Rmd",
-  "tidy.Rmd",
-  "relational-data.Rmd",
-  "strings.Rmd",
-  "factors.Rmd",
-  "datetimes.Rmd",
-
-  "program.Rmd",
-  "pipes.Rmd",
-  "functions.Rmd",
-  "vectors.Rmd",
-  "iteration.Rmd",
-
-  "model.Rmd",
-  "model-basics.Rmd",
-  "model-building.Rmd",
-  "model-many.Rmd",
-
-  "communicate.Rmd",
-  "rmarkdown.Rmd",
-  "communicate-plots.Rmd",
-  "rmarkdown-formats.Rmd",
-  "rmarkdown-workflow.Rmd",
-]
-
-before_chapter_script: "_common.R"
diff --git a/_common.R b/_common.R
index 363008388..036b73352 100644
--- a/_common.R
+++ b/_common.R
@@ -1,15 +1,55 @@
 set.seed(1014)
 
-options(digits = 3)
 
 knitr::opts_chunk$set(
   comment = "#>",
   collapse = TRUE,
-  cache = TRUE,
-  out.width = "70%",
-  fig.align = 'center',
+  # cache = TRUE,
+  fig.retina = 2,
   fig.width = 6,
-  fig.asp = 0.618,  # 1 / phi
+  fig.asp = 2/3,
   fig.show = "hold"
 )
 
-options(dplyr.print_min = 6, dplyr.print_max = 6)
+options(
+  dplyr.print_min = 6,
+  dplyr.print_max = 6,
+  pillar.max_footer_lines = 2,
+  pillar.min_chars = 15,
+  stringr.view_n = 6,
+  # Temporarily deactivate cli output for quarto
+  cli.num_colors = 0,
+  cli.hyperlink = FALSE,
+  pillar.bold = TRUE,
+  width = 77 # 80 - 3 for #> comment
+)
+
+ggplot2::theme_set(ggplot2::theme_gray(12))
+
+# use results: "asis" when setting a status for a chapter
+status <- function(type) {
+  status <- switch(type,
+    polishing = "should be readable but is currently undergoing final polishing",
+    restructuring = "is undergoing heavy restructuring and may be confusing or incomplete",
+    drafting = "is currently a dumping ground for ideas, and we don't recommend reading it",
+    complete = "is largely complete and just needs final proof reading",
+    stop("Invalid `type`", call. = FALSE)
+  )
+
+  class <- switch(type,
+    polishing = "note",
+    restructuring = "important",
+    drafting = "important",
+    complete = "note"
+  )
+
+  cat(paste0(
+    "\n",
+    ":::: status\n",
+    "::: callout-", class, " \n",
+    "You are reading the work-in-progress second edition of R for Data Science. ",
+    "This chapter ", status, ". 
", + "You can find the complete first edition at .\n", + ":::\n", + "::::\n" + )) +} diff --git a/_freeze/arrow/execute-results/html.json b/_freeze/arrow/execute-results/html.json new file mode 100644 index 000000000..686eabe6d --- /dev/null +++ b/_freeze/arrow/execute-results/html.json @@ -0,0 +1,14 @@ +{ + "hash": "8ae7678995e8995f137d44c9c7d335ba", + "result": { + "markdown": "---\nfreeze: true\n---\n\n\n# Arrow {#sec-arrow}\n\n\n\n:::: status\n::: callout-note \nYou are reading the work-in-progress second edition of R for Data Science. This chapter is largely complete and just needs final proof reading. You can find the complete first edition at .\n:::\n::::\n\n\n## Introduction\n\nCSV files are designed to be easily read by humans.\nThey're a good interchange format because they're very simple and they can be read by every tool under the sun.\nBut CSV files aren't very efficient: you have to do quite a lot of work to read the data into R.\nIn this chapter, you'll learn about a powerful alternative: the [parquet format](https://parquet.apache.org/), an open standards-based format widely used by big data systems.\n\nWe'll pair parquet files with [Apache Arrow](https://arrow.apache.org), a multi-language toolbox designed for efficient analysis and transport of large datasets.\nWe'll use Apache Arrow via the the [arrow package](https://arrow.apache.org/docs/r/), which provides a dplyr backend allowing you to analyze larger-than-memory datasets using familiar dplyr syntax.\nAs an additional benefit, arrow is extremely fast: you'll see some examples later in the chapter.\n\nBoth arrow and dbplyr provide dplyr backends, so you might wonder when to use each.\nIn many cases, the choice is made for you, as in the data is already in a database or in parquet files, and you'll want to work with it as is.\nBut if you're starting with your own data (perhaps CSV files), you can either load it into a database or convert it to parquet.\nIn general, it's hard to know what will work best, so in the early stages of your analysis we'd encourage you to try both and pick the one that works the best for you.\n\n(A big thanks to Danielle Navarro who contributed the initial version of this chapter.)\n\n### Prerequisites\n\nIn this chapter, we'll continue to use the tidyverse, particularly dplyr, but we'll pair it with the arrow package which is designed specifically for working with large data.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(tidyverse)\nlibrary(arrow)\n```\n:::\n\n\nLater in the chapter, we'll also see some connections between arrow and duckdb, so we'll also need dbplyr and duckdb.\n\n\n::: {.cell}\n\n```{.r .cell-code}\nlibrary(dbplyr, warn.conflicts = FALSE)\nlibrary(duckdb)\n#> Loading required package: DBI\n```\n:::\n\n\n## Getting the data\n\nWe begin by getting a dataset worthy of these tools: a dataset of item checkouts from Seattle public libraries, available online at [data.seattle.gov/Community/Checkouts-by-Title/tmmm-ytt6](https://data.seattle.gov/Community/Checkouts-by-Title/tmmm-ytt6).\nThis dataset contains 41,389,465 rows that tell you how many times each book was checked out each month from April 2005 to October 2022.\n\nThe following code will get you a cached copy of the data.\nThe data is a 9GB CSV file, so it will take some time to download.\nI highly recommend using `curl::multidownload()` to get very large files as it's built for exactly this purpose: it gives you a progress bar and it can resume the download if its interrupted.\n\n\n::: {.cell}\n\n```{.r 
.cell-code}\ndir.create(\"data\", showWarnings = FALSE)\n\ncurl::multi_download(\n \"https://r4ds.s3.us-west-2.amazonaws.com/seattle-library-checkouts.csv\",\n \"data/seattle-library-checkouts.csv\",\n resume = TRUE\n)\n```\n:::\n\n\n## Opening a dataset\n\nLet's start by taking a look at the data.\nAt 9GB, this file is large enough that we probably don't want to load the whole thing into memory.\nA good rule of thumb is that you usually want at least twice as much memory as the size of the data, and many laptops top out at 16 Gb.\nThis means we want to avoid `read_csv()` and instead use the `arrow::open_dataset()`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv <- open_dataset(\n sources = \"data/seattle-library-checkouts.csv\", \n format = \"csv\"\n)\n```\n:::\n\n\nWhat happens when this code is run?\n`open_dataset()` will scan a few thousand rows to figure out the structure of the dataset.\nThen it records what it's found and stops; it will only read further rows as you specifically request them.\nThis metadata is what we see if we print `seattle_csv`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv\n#> FileSystemDataset with 1 csv file\n#> UsageClass: string\n#> CheckoutType: string\n#> MaterialType: string\n#> CheckoutYear: int64\n#> CheckoutMonth: int64\n#> Checkouts: int64\n#> Title: string\n#> ISBN: null\n#> Creator: string\n#> Subjects: string\n#> Publisher: string\n#> PublicationYear: string\n```\n:::\n\n\nThe first line in the output tells you that `seattle_csv` is stored locally on-disk as a single CSV file; it will only be loaded into memory as needed.\nThe remainder of the output tells you the column type that arrow has imputed for each column.\n\nWe can see what's actually in with `glimpse()`.\nThis reveals that there are \\~41 million rows and 12 columns, and shows us a few values.\n\n\n::: {.cell hash='arrow_cache/html/glimpse-data_07c924738790eb185ebdd8973443e90d'}\n\n```{.r .cell-code}\nseattle_csv |> glimpse()\n#> FileSystemDataset with 1 csv file\n#> 41,389,465 rows x 12 columns\n#> $ UsageClass \"Physical\", \"Physical\", \"Digital\", \"Physical\", \"Ph…\n#> $ CheckoutType \"Horizon\", \"Horizon\", \"OverDrive\", \"Horizon\", \"Hor…\n#> $ MaterialType \"BOOK\", \"BOOK\", \"EBOOK\", \"BOOK\", \"SOUNDDISC\", \"BOO…\n#> $ CheckoutYear 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 20…\n#> $ CheckoutMonth 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,…\n#> $ Checkouts 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 2, 3, 2, 1, 3, 2,…\n#> $ Title \"Super rich : a guide to having it all / Russell S…\n#> $ ISBN \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\", \"\"…\n#> $ Creator \"Simmons, Russell\", \"Barclay, James, 1965-\", \"Tim …\n#> $ Subjects \"Self realization, Conduct of life, Attitude Psych…\n#> $ Publisher \"Gotham Books,\", \"Pyr,\", \"Random House, Inc.\", \"Di…\n#> $ PublicationYear \"c2011.\", \"2010.\", \"2015\", \"2005.\", \"c2004.\", \"c20…\n```\n:::\n\n\nWe can start to use this dataset with dplyr verbs, using `collect()` to force arrow to perform the computation and return some data.\nFor example, this code tells us the total number of checkouts per year:\n\n\n::: {.cell hash='arrow_cache/html/unnamed-chunk-5_7a5e1ce0bed4d69e849dff75d0c0d8d3'}\n\n```{.r .cell-code}\nseattle_csv |> \n count(CheckoutYear, wt = Checkouts) |> \n arrange(CheckoutYear) |> \n collect()\n#> # A tibble: 18 × 2\n#> CheckoutYear n\n#> \n#> 1 2005 3798685\n#> 2 2006 6599318\n#> 3 2007 7126627\n#> 4 2008 8438486\n#> 5 2009 9135167\n#> 6 2010 8608966\n#> # … with 
12 more rows\n```\n:::\n\n\nThanks to arrow, this code will work regardless of how large the underlying dataset is.\nBut it's currently rather slow: on Hadley's computer, it took \\~10s to run.\nThat's not terrible given how much data we have, but we can make it much faster by switching to a better format.\n\n## The parquet format {#sec-parquet}\n\nTo make this data easier to work with, lets switch to the parquet file format and split it up into multiple files.\nThe following sections will first introduce you to parquet and partitioning, and then apply what we learned to the Seattle library data.\n\n### Advantages of parquet\n\nLike CSV, parquet is used for rectangular data, but instead of being a text format that you can read with any file editor, it's a custom binary format designed specifically for the needs of big data.\nThis means that:\n\n- Parquet files are usually smaller the equivalent CSV file.\n Parquet relies on [efficient encodings](https://parquet.apache.org/docs/file-format/data-pages/encodings/) to keep file size down, and supports file compression.\n This helps make parquet files fast because there's less data to move from disk to memory.\n\n- Parquet files have a rich type system.\n As we talked about in @sec-col-types, a CSV file does not provide any information about column types.\n For example, a CSV reader has to guess whether `\"08-10-2022\"` should be parsed as a string or a date.\n In contrast, parquet files store data in a way that records the type along with the data.\n\n- Parquet files are \"column-oriented\".\n This means that they're organized column-by-column, much like R's data frame.\n This typically leads to better performance for data analysis tasks compared to CSV files, which are organized row-by-row.\n\n- Parquet files are \"chunked\", which makes it possible to work on different parts of the file at the same time, and, if you're lucky, to skip some chunks all together.\n\n### Partitioning\n\nAs datasets get larger and larger, storing all the data in a single file gets increasingly painful and it's often useful to split large datasets across many files.\nWhen this structuring is done intelligently, this strategy can lead to significant improvements in performance because many analyses will only require a subset of the files.\n\nThere are no hard and fast rules about how to partition your dataset: the results will depend on your data, access patterns, and the systems that read the data.\nYou're likely to need to do some experimentation before you find the ideal partitioning for your situation.\nAs a rough guide, arrow suggests that you avoid files smaller than 20MB and larger than 2GB and avoid partitions that produce more than 10,000 files.\nYou should also try to partition by variables that you filter by; as you'll see shortly, that allows arrow to skip a lot of work by reading only the relevant files.\n\n### Rewriting the Seattle library data\n\nLet's apply these ideas to the Seattle library data to see how they play out in practice.\nWe're going to partition by `CheckoutYear`, since it's likely some analyses will only want to look at recent data and partitioning by year yields 18 chunks of a reasonable size.\n\nTo rewrite the data we define the partition using `dplyr::group_by()` and then save the partitions to a directory with `arrow::write_dataset()`.\n`write_dataset()` has two important arguments: a directory where we'll create the files and the format we'll use.\n\n\n::: {.cell}\n\n```{.r .cell-code}\npq_path <- 
\"data/seattle-library-checkouts\"\n```\n:::\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_csv |>\n group_by(CheckoutYear) |>\n write_dataset(path = pq_path, format = \"parquet\")\n```\n:::\n\n\nThis takes about a minute to run; as we'll see shortly this is an initial investment that pays off by making future operations much much faster.\n\nLet's take a look at what we just produced:\n\n\n::: {.cell}\n\n```{.r .cell-code}\ntibble(\n files = list.files(pq_path, recursive = TRUE),\n size_MB = file.size(file.path(pq_path, files)) / 1024^2\n)\n#> # A tibble: 18 × 2\n#> files size_MB\n#> \n#> 1 CheckoutYear=2005/part-0.parquet 109.\n#> 2 CheckoutYear=2006/part-0.parquet 164.\n#> 3 CheckoutYear=2007/part-0.parquet 178.\n#> 4 CheckoutYear=2008/part-0.parquet 195.\n#> 5 CheckoutYear=2009/part-0.parquet 214.\n#> 6 CheckoutYear=2010/part-0.parquet 222.\n#> # … with 12 more rows\n```\n:::\n\n\nOur single 9GB CSV file has been rewritten into 18 parquet files.\nThe file names use a \"self-describing\" convention used by the [Apache Hive](https://hive.apache.org) project.\nHive-style partitions name folders with a \"key=value\" convention, so as you might guess, the `CheckoutYear=2005` directory contains all the data where `CheckoutYear` is 2005.\nEach file is between 100 and 300 MB and the total size is now around 4 GB, a little over half the size of the original CSV file.\nThis is as we expect since parquet is a much more efficient format.\n\n## Using dplyr with arrow\n\nNow we've created these parquet files, we'll need to read them in again.\nWe use `open_dataset()` again, but this time we give it a directory:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_pq <- open_dataset(pq_path)\n```\n:::\n\n\nNow we can write our dplyr pipeline.\nFor example, we could count the total number of books checked out in each month for the last five years:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nquery <- seattle_pq |> \n filter(CheckoutYear >= 2018, MaterialType == \"BOOK\") |>\n group_by(CheckoutYear, CheckoutMonth) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(CheckoutYear, CheckoutMonth)\n```\n:::\n\n\nWriting dplyr code for arrow data is conceptually similar to dbplyr, @sec-import-databases: you write dplyr code, which is automatically transformed into a query that the Apache Arrow C++ library understands, which is then executed when you call `collect()`.\nIf we print out the `query` object we can see a little information about what we expect Arrow to return when the execution takes place:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nquery\n#> FileSystemDataset (query)\n#> CheckoutYear: int32\n#> CheckoutMonth: int64\n#> TotalCheckouts: int64\n#> \n#> * Grouped by CheckoutYear\n#> * Sorted by CheckoutYear [asc], CheckoutMonth [asc]\n#> See $.data for the source Arrow object\n```\n:::\n\n\nAnd we can get the results by calling `collect()`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nquery |> collect()\n#> # A tibble: 58 × 3\n#> # Groups: CheckoutYear [5]\n#> CheckoutYear CheckoutMonth TotalCheckouts\n#> \n#> 1 2018 1 355101\n#> 2 2018 2 309813\n#> 3 2018 3 344487\n#> 4 2018 4 330988\n#> 5 2018 5 318049\n#> 6 2018 6 341825\n#> # … with 52 more rows\n```\n:::\n\n\nLike dbplyr, arrow only understands some R expressions, so you may not be able to write exactly the same code you usually would.\nHowever, the list of operations and functions supported is fairly extensive and continues to grow; find a complete list of currently supported functions in `?acero`.\n\n### Performance {#sec-parquet-fast}\n\nLet's take a 
quick look at the performance impact of switching from CSV to parquet.\nFirst, let's time how long it takes to calculate the number of books checked out in each month of 2021, when the data is stored as a single large csv:\n\n\n::: {.cell hash='arrow_cache/html/dataset-performance-csv_483a703c116b20d0e51a2183c096cfa2'}\n\n```{.r .cell-code}\nseattle_csv |> \n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |> \n system.time()\n#> user system elapsed \n#> 11.997 1.189 11.343\n```\n:::\n\n\nNow let's use our new version of the dataset in which the Seattle library checkout data has been partitioned into 18 smaller parquet files:\n\n\n::: {.cell hash='arrow_cache/html/dataset-performance-multiple-parquet_de9e0ac3cfc08b2e6eef4a12f94f8391'}\n\n```{.r .cell-code}\nseattle_pq |> \n filter(CheckoutYear == 2021, MaterialType == \"BOOK\") |>\n group_by(CheckoutMonth) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutMonth)) |>\n collect() |> \n system.time()\n#> user system elapsed \n#> 0.272 0.063 0.063\n```\n:::\n\n\nThe \\~100x speedup in performance is attributable to two factors: the multi-file partitioning, and the format of individual files:\n\n- Partitioning improves performance because this query uses `CheckoutYear == 2021` to filter the data, and arrow is smart enough to recognize that it only needs to read 1 of the 18 parquet files.\n- The parquet format improves performance by storing data in a binary format that can be read more directly into memory. The column-wise format and rich metadata means that arrow only needs to read the four columns actually used in the query (`CheckoutYear`, `MaterialType`, `CheckoutMonth`, and `Checkouts`).\n\nThis massive difference in performance is why it pays off to convert large CSVs to parquet!\n\n### Using dbplyr with arrow\n\nThere's one last advantage of parquet and arrow --- it's very easy to turn an arrow dataset into a DuckDB database (@sec-import-databases) by calling `arrow::to_duckdb()`:\n\n\n::: {.cell}\n\n```{.r .cell-code}\nseattle_pq |> \n to_duckdb() |>\n filter(CheckoutYear >= 2018, MaterialType == \"BOOK\") |>\n group_by(CheckoutYear) |>\n summarize(TotalCheckouts = sum(Checkouts)) |>\n arrange(desc(CheckoutYear)) |>\n collect()\n#> Warning: Missing values are always removed in SQL aggregation functions.\n#> Use `na.rm = TRUE` to silence this warning\n#> This warning is displayed once every 8 hours.\n#> # A tibble: 5 × 2\n#> CheckoutYear TotalCheckouts\n#> \n#> 1 2022 2431502\n#> 2 2021 2266438\n#> 3 2020 1241999\n#> 4 2019 3931688\n#> 5 2018 3987569\n```\n:::\n\n\nThe neat thing about `to_duckdb()` is that the transfer doesn't involve any memory copying, and speaks to the goals of the arrow ecosystem: enabling seamless transitions from one computing environment to another.\n\n## Summary\n\nIn this chapter, you've been given a taste of the arrow package, which provides a dplyr backend for working with large on-disk datasets.\nIt can work with CSV files, its much much faster if you convert your data to parquet.\nParquet is a binary data format that's designed specifically for data analysis on modern computers.\nFar fewer tools can work with parquet files compared to CSV, but it's partitioned, compressed, and columnar structure makes it much more efficient to analyze.\n\nNext up you'll learn about your first non-rectangular data source, which you'll handle using tools provided by the 
tidyr package.\nWe'll focus on data that comes from JSON files, but the general principles apply to tree-like data regardless of its source.\n", + "supporting": [], + "filters": [ + "rmarkdown/pagebreak.lua" + ], + "includes": {}, + "engineDependencies": {}, + "preserve": {}, + "postProcess": true + } +} \ No newline at end of file diff --git a/_output.yaml b/_output.yaml deleted file mode 100644 index a82ee78bf..000000000 --- a/_output.yaml +++ /dev/null @@ -1,15 +0,0 @@ -bookdown::gitbook: - config: - toc: - collapse: section - before: | -
-        <li><a href="./">R for Data Science</a></li>
  • - edit: - link: https://github.com/hadley/r4ds/edit/master/%s - text: "Edit" - sharing: no - css: r4ds.css - -bookdown::pdf_book: - latex_engine: "xelatex" - diff --git a/_quarto.yml b/_quarto.yml new file mode 100644 index 000000000..4768ee9f3 --- /dev/null +++ b/_quarto.yml @@ -0,0 +1,86 @@ +project: + type: book + output-dir: _book + +book: + title: "R for Data Science (2e)" + reader-mode: true + + page-footer: + left: | + R for Data Science (2e) was written by Hadley Wickham, Mine + Çetinkaya-Rundel, and Garrett Grolemund. + right: | + This book was built with Quarto. + cover-image: cover.jpg + favicon: cover.jpg + site-url: https://r4ds.hadley.nz/ + repo-url: https://github.com/hadley/r4ds/ + repo-branch: main + repo-actions: [edit, issue] + chapters: + - index.qmd + + - preface-2e.qmd + - intro.qmd + + - part: whole-game.qmd + chapters: + - data-visualize.qmd + - workflow-basics.qmd + - data-transform.qmd + - workflow-style.qmd + - data-tidy.qmd + - workflow-scripts.qmd + - data-import.qmd + - workflow-help.qmd + + - part: visualize.qmd + chapters: + - layers.qmd + - EDA.qmd + - communication.qmd + + - part: transform.qmd + chapters: + - logicals.qmd + - numbers.qmd + - strings.qmd + - regexps.qmd + - factors.qmd + - datetimes.qmd + - missing-values.qmd + - joins.qmd + + - part: import.qmd + chapters: + - spreadsheets.qmd + - databases.qmd + - arrow.qmd + - rectangling.qmd + - webscraping.qmd + + - part: program.qmd + chapters: + - functions.qmd + - iteration.qmd + - base-R.qmd + + - part: communicate.qmd + chapters: + - quarto.qmd + - quarto-formats.qmd + +format: + html: + theme: + - cosmo + - r4ds.scss + code-link: true + + author-meta: "Hadley Wickham, Mine Çetinkaya-Rundel, and Garrett Grolemund" + include-in-header: "plausible.html" + callout-appearance: simple + +editor: visual + diff --git a/arrow.qmd b/arrow.qmd new file mode 100644 index 000000000..36f3e21d3 --- /dev/null +++ b/arrow.qmd @@ -0,0 +1,299 @@ +--- +freeze: true +--- + +# Arrow {#sec-arrow} + +```{r} +#| echo: false + +source("_common.R") +``` + +## Introduction + +CSV files are designed to be easily read by humans. +They're a good interchange format because they're very simple and they can be read by every tool under the sun. +But CSV files aren't very efficient: you have to do quite a lot of work to read the data into R. +In this chapter, you'll learn about a powerful alternative: the [parquet format](https://parquet.apache.org/), an open standards-based format widely used by big data systems. + +We'll pair parquet files with [Apache Arrow](https://arrow.apache.org), a multi-language toolbox designed for efficient analysis and transport of large datasets. +We'll use Apache Arrow via the [arrow package](https://arrow.apache.org/docs/r/), which provides a dplyr backend allowing you to analyze larger-than-memory datasets using familiar dplyr syntax. +As an additional benefit, arrow is extremely fast: you'll see some examples later in the chapter. + +Both arrow and dbplyr provide dplyr backends, so you might wonder when to use each. +In many cases, the choice is made for you, as the data is already in a database or in parquet files, and you'll want to work with it as is. +But if you're starting with your own data (perhaps CSV files), you can either load it into a database or convert it to parquet. +In general, it's hard to know what will work best, so in the early stages of your analysis we'd encourage you to try both and pick the one that works the best for you. 
+
+(A big thanks to Danielle Navarro who contributed the initial version of this chapter.)
+
+### Prerequisites
+
+In this chapter, we'll continue to use the tidyverse, particularly dplyr, but we'll pair it with the arrow package which is designed specifically for working with large data.
+
+```{r setup}
+#| message: false
+#| warning: false
+library(tidyverse)
+library(arrow)
+```
+
+Later in the chapter, we'll also see some connections between arrow and duckdb, so we'll also need dbplyr and duckdb.
+
+```{r}
+library(dbplyr, warn.conflicts = FALSE)
+library(duckdb)
+```
+
+## Getting the data
+
+We begin by getting a dataset worthy of these tools: a dataset of item checkouts from Seattle public libraries, available online at [data.seattle.gov/Community/Checkouts-by-Title/tmmm-ytt6](https://data.seattle.gov/Community/Checkouts-by-Title/tmmm-ytt6).
+This dataset contains 41,389,465 rows that tell you how many times each book was checked out each month from April 2005 to October 2022.
+
+The following code will get you a cached copy of the data.
+The data is a 9GB CSV file, so it will take some time to download.
+I highly recommend using `curl::multi_download()` to get very large files as it's built for exactly this purpose: it gives you a progress bar and it can resume the download if it's interrupted.
+
+```{r}
+#| eval: false
+dir.create("data", showWarnings = FALSE)
+
+curl::multi_download(
+  "https://r4ds.s3.us-west-2.amazonaws.com/seattle-library-checkouts.csv",
+  "data/seattle-library-checkouts.csv",
+  resume = TRUE
+)
+```
+
+## Opening a dataset
+
+Let's start by taking a look at the data.
+At 9GB, this file is large enough that we probably don't want to load the whole thing into memory.
+A good rule of thumb is that you usually want at least twice as much memory as the size of the data, and many laptops top out at 16 GB.
+This means we want to avoid `read_csv()` and instead use `arrow::open_dataset()`:
+
+```{r open-dataset}
+seattle_csv <- open_dataset(
+  sources = "data/seattle-library-checkouts.csv", 
+  col_types = schema(ISBN = string()),
+  format = "csv"
+)
+```
+
+What happens when this code is run?
+`open_dataset()` will scan a few thousand rows to figure out the structure of the dataset.
+The `ISBN` column contains blank values for the first 80,000 rows, so we have to specify the column type to help arrow work out the data structure.
+Once the data has been scanned by `open_dataset()`, it records what it's found and stops; it will only read further rows as you specifically request them.
+This metadata is what we see if we print `seattle_csv`:
+
+```{r}
+seattle_csv
+```
+
+The first line in the output tells you that `seattle_csv` is stored locally on-disk as a single CSV file; it will only be loaded into memory as needed.
+The remainder of the output tells you the column type that arrow has imputed for each column.
+
+We can see what's actually in it with `glimpse()`.
+This reveals that there are \~41 million rows and 12 columns, and shows us a few values.
+
+```{r glimpse-data}
+#| cache: true
+seattle_csv |> glimpse()
+```
+
+We can start to use this dataset with dplyr verbs, using `collect()` to force arrow to perform the computation and return some data.
+For example, this code tells us the total number of checkouts per year:
+
+```{r}
+#| cache: true
+seattle_csv |> 
+  group_by(CheckoutYear) |> 
+  summarise(Checkouts = sum(Checkouts)) |> 
+  arrange(CheckoutYear) |> 
+  collect()
+```
+
+Thanks to arrow, this code will work regardless of how large the underlying dataset is.
+But it's currently rather slow: on Hadley's computer, it took \~10s to run. +That's not terrible given how much data we have, but we can make it much faster by switching to a better format. + +## The parquet format {#sec-parquet} + +To make this data easier to work with, let's switch to the parquet file format and split it up into multiple files. +The following sections will first introduce you to parquet and partitioning, and then apply what we learned to the Seattle library data. + +### Advantages of parquet + +Like CSV, parquet is used for rectangular data, but instead of being a text format that you can read with any file editor, it's a custom binary format designed specifically for the needs of big data. +This means that: + +- Parquet files are usually smaller than the equivalent CSV file. + Parquet relies on [efficient encodings](https://parquet.apache.org/docs/file-format/data-pages/encodings/) to keep file size down, and supports file compression. + This helps make parquet files fast because there's less data to move from disk to memory. + +- Parquet files have a rich type system. + As we talked about in @sec-col-types, a CSV file does not provide any information about column types. + For example, a CSV reader has to guess whether `"08-10-2022"` should be parsed as a string or a date. + In contrast, parquet files store data in a way that records the type along with the data. + +- Parquet files are "column-oriented". + This means that they're organized column-by-column, much like R's data frame. + This typically leads to better performance for data analysis tasks compared to CSV files, which are organized row-by-row. + +- Parquet files are "chunked", which makes it possible to work on different parts of the file at the same time, and, if you're lucky, to skip some chunks altogether. + +There's one primary disadvantage to parquet files: they are no longer "human readable", i.e. if you look at a parquet file using `readr::read_file()`, you'll just see a bunch of gibberish. + +### Partitioning + +As datasets get larger and larger, storing all the data in a single file gets increasingly painful and it's often useful to split large datasets across many files. +When this structuring is done intelligently, this strategy can lead to significant improvements in performance because many analyses will only require a subset of the files. + +There are no hard and fast rules about how to partition your dataset: the results will depend on your data, access patterns, and the systems that read the data. +You're likely to need to do some experimentation before you find the ideal partitioning for your situation. +As a rough guide, arrow suggests that you avoid files smaller than 20MB and larger than 2GB and avoid partitions that produce more than 10,000 files. +You should also try to partition by variables that you filter by; as you'll see shortly, that allows arrow to skip a lot of work by reading only the relevant files. + +### Rewriting the Seattle library data + +Let's apply these ideas to the Seattle library data to see how they play out in practice. +We're going to partition by `CheckoutYear`, since it's likely some analyses will only want to look at recent data and partitioning by year yields 18 chunks of a reasonable size. + +To rewrite the data we define the partition using `dplyr::group_by()` and then save the partitions to a directory with `arrow::write_dataset()`. +`write_dataset()` has two important arguments: a directory where we'll create the files and the format we'll use. 
+
+```{r}
+pq_path <- "data/seattle-library-checkouts"
+```
+
+```{r write-dataset}
+#| eval: !expr "!file.exists(pq_path)"
+
+seattle_csv |>
+  group_by(CheckoutYear) |>
+  write_dataset(path = pq_path, format = "parquet")
+```
+
+This takes about a minute to run; as we'll see shortly this is an initial investment that pays off by making future operations much, much faster.
+
+Let's take a look at what we just produced:
+
+```{r show-parquet-files}
+tibble(
+  files = list.files(pq_path, recursive = TRUE),
+  size_MB = file.size(file.path(pq_path, files)) / 1024^2
+)
+```
+
+Our single 9GB CSV file has been rewritten into 18 parquet files.
+The file names use a "self-describing" convention used by the [Apache Hive](https://hive.apache.org) project.
+Hive-style partitions name folders with a "key=value" convention, so as you might guess, the `CheckoutYear=2005` directory contains all the data where `CheckoutYear` is 2005.
+Each file is between 100 and 300 MB and the total size is now around 4 GB, a little over half the size of the original CSV file.
+This is as we expect since parquet is a much more efficient format.
+
+## Using dplyr with arrow
+
+Now that we've created these parquet files, we'll need to read them in again.
+We use `open_dataset()` again, but this time we give it a directory:
+
+```{r}
+seattle_pq <- open_dataset(pq_path)
+```
+
+Now we can write our dplyr pipeline.
+For example, we could count the total number of books checked out in each month for the last five years:
+
+```{r books-by-year-query}
+query <- seattle_pq |> 
+  filter(CheckoutYear >= 2018, MaterialType == "BOOK") |>
+  group_by(CheckoutYear, CheckoutMonth) |>
+  summarize(TotalCheckouts = sum(Checkouts)) |>
+  arrange(CheckoutYear, CheckoutMonth)
+```
+
+Writing dplyr code for arrow data is conceptually similar to dbplyr (@sec-import-databases): you write dplyr code, which is automatically transformed into a query that the Apache Arrow C++ library understands, which is then executed when you call `collect()`.
+If we print out the `query` object we can see a little information about what we expect Arrow to return when the execution takes place:
+
+```{r}
+query
+```
+
+And we can get the results by calling `collect()`:
+
+```{r books-by-year}
+query |> collect()
+```
+
+Like dbplyr, arrow only understands some R expressions, so you may not be able to write exactly the same code you usually would.
+However, the list of operations and functions supported is fairly extensive and continues to grow; find a complete list of currently supported functions in `?acero`.
+
+### Performance {#sec-parquet-fast}
+
+Let's take a quick look at the performance impact of switching from CSV to parquet.
+First, let's time how long it takes to calculate the number of books checked out in each month of 2021, when the data is stored as a single large CSV:
+
+```{r dataset-performance-csv}
+#| cache: true
+
+seattle_csv |> 
+  filter(CheckoutYear == 2021, MaterialType == "BOOK") |>
+  group_by(CheckoutMonth) |>
+  summarize(TotalCheckouts = sum(Checkouts)) |>
+  arrange(desc(CheckoutMonth)) |>
+  collect() |> 
+  system.time()
+```
+
+Now let's use our new version of the dataset in which the Seattle library checkout data has been partitioned into 18 smaller parquet files:
+
+```{r dataset-performance-multiple-parquet}
+#| cache: true
+
+seattle_pq |> 
+  filter(CheckoutYear == 2021, MaterialType == "BOOK") |>
+  group_by(CheckoutMonth) |>
+  summarize(TotalCheckouts = sum(Checkouts)) |>
+  arrange(desc(CheckoutMonth)) |>
+  collect() |> 
+  system.time()
+```
+
+The \~100x speedup in performance is attributable to two factors: the multi-file partitioning, and the format of individual files:
+
+-   Partitioning improves performance because this query uses `CheckoutYear == 2021` to filter the data, and arrow is smart enough to recognize that it only needs to read 1 of the 18 parquet files.
+-   The parquet format improves performance by storing data in a binary format that can be read more directly into memory. The column-wise format and rich metadata means that arrow only needs to read the four columns actually used in the query (`CheckoutYear`, `MaterialType`, `CheckoutMonth`, and `Checkouts`).
+
+This massive difference in performance is why it pays off to convert large CSVs to parquet!
+
+### Using duckdb with arrow
+
+There's one last advantage of parquet and arrow --- it's very easy to turn an arrow dataset into a DuckDB database (@sec-import-databases) by calling `arrow::to_duckdb()`:
+
+```{r use-duckdb}
+seattle_pq |> 
+  to_duckdb() |>
+  filter(CheckoutYear >= 2018, MaterialType == "BOOK") |>
+  group_by(CheckoutYear) |>
+  summarize(TotalCheckouts = sum(Checkouts)) |>
+  arrange(desc(CheckoutYear)) |>
+  collect()
+```
+
+The neat thing about `to_duckdb()` is that the transfer doesn't involve any memory copying, and speaks to the goals of the arrow ecosystem: enabling seamless transitions from one computing environment to another.
+
+### Exercises
+
+1.  Figure out the most popular book each year.
+2.  Which author has the most books in the Seattle library system?
+3.  How have checkouts of books vs. ebooks changed over the last 10 years?
+
+## Summary
+
+In this chapter, you've been given a taste of the arrow package, which provides a dplyr backend for working with large on-disk datasets.
+It can work with CSV files, and it's much, much faster if you convert your data to parquet.
+Parquet is a binary data format that's designed specifically for data analysis on modern computers.
+Far fewer tools can work with parquet files compared to CSV, but its partitioned, compressed, and columnar structure makes it much more efficient to analyze.
+
+Next up you'll learn about your first non-rectangular data source, which you'll handle using tools provided by the tidyr package.
+We'll focus on data that comes from JSON files, but the general principles apply to tree-like data regardless of its source.
diff --git a/base-R.qmd b/base-R.qmd new file mode 100644 index 000000000..b3de68cce --- /dev/null +++ b/base-R.qmd @@ -0,0 +1,546 @@ +# A field guide to base R {#sec-base-r}

```{r}
#| echo: false

source("_common.R")
```

## Introduction

To finish off the programming section, we're going to give you a quick tour of the most important base R functions that we don't otherwise discuss in the book.
These tools are particularly useful as you do more programming and will help you read code you'll encounter in the wild.

This is a good place to remind you that the tidyverse is not the only way to solve data science problems.
We teach the tidyverse in this book because tidyverse packages share a common design philosophy, increasing the consistency across functions, and making each new function or package a little easier to learn and use.
It's not possible to use the tidyverse without using base R, so we've actually already taught you a **lot** of base R functions: from `library()` to load packages, to `sum()` and `mean()` for numeric summaries, to the factor, date, and POSIXct data types, and of course all the basic operators like `+`, `-`, `/`, `*`, `|`, `&`, and `!`.
What we haven't focused on so far is base R workflows, so we will highlight a few of those in this chapter.

After you finish this book, you'll likely learn other approaches to the same problems using base R, data.table, and other packages.
You'll undoubtedly encounter these other approaches when you start reading R code written by others, particularly if you're using Stack Overflow.
It's 100% okay to write code that uses a mix of approaches, so don't let anyone tell you otherwise!

In this chapter, we'll focus on four big topics: subsetting with `[`, subsetting with `[[` and `$`, the apply family of functions, and `for` loops.
To finish off, we'll briefly discuss two essential plotting functions.

### Prerequisites

This chapter focuses on base R, so it doesn't have any real prerequisites, but we'll load the tidyverse in order to explain some of the differences.

```{r}
#| label: setup
#| message: false

library(tidyverse)
```

## Selecting multiple elements with `[` {#sec-subset-many}

`[` is used to extract sub-components from vectors and data frames, and is called like `x[i]` or `x[i, j]`.
In this section, we'll introduce you to the power of `[`, first showing you how you can use it with vectors, then how the same principles extend in a straightforward way to two-dimensional (2d) structures like data frames.
We'll then help you cement that knowledge by showing how various dplyr verbs are special cases of `[`.

### Subsetting vectors

There are five main types of things that you can subset a vector with, i.e., that can be the `i` in `x[i]`:

1. **A vector of positive integers**.
    Subsetting with positive integers keeps the elements at those positions:

    ```{r}
    x <- c("one", "two", "three", "four", "five")
    x[c(3, 2, 5)]
    ```

    By repeating a position, you can actually make a longer output than input, making the term "subsetting" a bit of a misnomer.

    ```{r}
    x[c(1, 1, 5, 5, 5, 2)]
    ```

2. **A vector of negative integers**.
    Negative values drop the elements at the specified positions:

    ```{r}
    x[c(-1, -3, -5)]
    ```

3. **A logical vector**.
    Subsetting with a logical vector keeps all values corresponding to a `TRUE` value.
    This is most often useful in conjunction with the comparison functions.
+

    ```{r}
    x <- c(10, 3, NA, 5, 8, 1, NA)

    # All non-missing values of x
    x[!is.na(x)]

    # All even (or missing!) values of x
    x[x %% 2 == 0]
    ```

    Unlike `filter()`, `NA` indices will be included in the output as `NA`s.

4. **A character vector**.
    If you have a named vector, you can subset it with a character vector:

    ```{r}
    x <- c(abc = 1, def = 2, xyz = 5)
    x[c("xyz", "def")]
    ```

    As with subsetting with positive integers, you can use a character vector to duplicate individual entries.

5. **Nothing**.
    The final type of subsetting is nothing, `x[]`, which returns the complete `x`.
    This is not useful for subsetting vectors, but as we'll see shortly, it is useful when subsetting 2d structures like tibbles.

### Subsetting data frames

There are quite a few different ways[^base-r-1] that you can use `[` with a data frame, but the most important way is to select rows and columns independently with `df[rows, cols]`. Here `rows` and `cols` are vectors as described above.
For example, `df[rows, ]` and `df[, cols]` select just rows or just columns, using the empty subset to preserve the other dimension.

[^base-r-1]: Read the subsetting chapter of *Advanced R* (<https://adv-r.hadley.nz/subsetting.html>) to see how you can also subset a data frame like it is a 1d object and how you can subset it with a matrix.

Here are a couple of examples:

```{r}
df <- tibble(
  x = 1:3,
  y = c("a", "e", "f"),
  z = runif(3)
)

# Select first row and second column
df[1, 2]

# Select all rows and columns x and y
df[, c("x", "y")]

# Select rows where `x` is greater than 1 and all columns
df[df$x > 1, ]
```

We'll come back to `$` shortly, but you should be able to guess what `df$x` does from the context: it extracts the `x` variable from `df`.
We need to use it here because `[` doesn't use tidy evaluation, so you need to be explicit about the source of the `x` variable.

There's an important difference between tibbles and data frames when it comes to `[`.
In this book, we've mainly used tibbles, which *are* data frames, but they tweak some behaviors to make your life a little easier.
In most places, you can use "tibble" and "data frame" interchangeably, so when we want to draw particular attention to R's built-in data frame, we'll write `data.frame`.
If `df` is a `data.frame`, then `df[, cols]` will return a vector if `cols` selects a single column and a data frame if it selects more than one column.
If `df` is a tibble, then `[` will always return a tibble.

```{r}
df1 <- data.frame(x = 1:3)
df1[, "x"]

df2 <- tibble(x = 1:3)
df2[, "x"]
```

One way to avoid this ambiguity with `data.frame`s is to explicitly specify `drop = FALSE`:

```{r}
df1[, "x", drop = FALSE]
```

### dplyr equivalents

Several dplyr verbs are special cases of `[`:

- `filter()` is equivalent to subsetting the rows with a logical vector, taking care to exclude missing values:

    ```{r}
    #| results: false

    df <- tibble(
      x = c(2, 3, 1, 1, NA),
      y = letters[1:5],
      z = runif(5)
    )
    df |> filter(x > 1)

    # same as
    df[!is.na(df$x) & df$x > 1, ]
    ```

    Another common technique in the wild is to use `which()` for its side-effect of dropping missing values: `df[which(df$x > 1), ]`.
+

- `arrange()` is equivalent to subsetting the rows with an integer vector, usually created with `order()`:

    ```{r}
    #| results: false

    df |> arrange(x, y)

    # same as
    df[order(df$x, df$y), ]
    ```

    You can use `order(decreasing = TRUE)` to sort all columns in descending order, or `-rank(col)` to sort an individual column in descending order.

- Both `select()` and `relocate()` are similar to subsetting the columns with a character vector:

    ```{r}
    #| results: false

    df |> select(x, z)

    # same as
    df[, c("x", "z")]
    ```

Base R also provides a function that combines the features of `filter()` and `select()`[^base-r-2] called `subset()`:

[^base-r-2]: But it doesn't handle grouped data frames differently and it doesn't support selection helper functions like `starts_with()`.

```{r}
df |>
  filter(x > 1) |>
  select(y, z)
```

```{r}
#| results: false

# same as
df |> subset(x > 1, c(y, z))
```

This function was the inspiration for much of dplyr's syntax.

### Exercises

1. Create functions that take a vector as input and return:

    a. The elements at even-numbered positions.
    b. Every element except the last value.
    c. Only even values (and no missing values).

2. Why is `x[-which(x > 0)]` not the same as `x[x <= 0]`?
    Read the documentation for `which()` and do some experiments to figure it out.

## Selecting a single element with `$` and `[[` {#sec-subset-one}

`[`, which selects many elements, is paired with `[[` and `$`, which extract a single element.
In this section, we'll show you how to use `[[` and `$` to pull columns out of data frames, discuss a couple more differences between `data.frame`s and tibbles, and emphasize some important differences between `[` and `[[` when used with lists.

### Data frames

`[[` and `$` can be used to extract columns out of a data frame.
`[[` can access by position or by name, and `$` is specialized for access by name:

```{r}
tb <- tibble(
  x = 1:4,
  y = c(10, 4, 1, 21)
)

# by position
tb[[1]]

# by name
tb[["x"]]
tb$x
```

They can also be used to create new columns, the base R equivalent of `mutate()`:

```{r}
tb$z <- tb$x + tb$y
tb
```

There are several other base R approaches to creating new columns, including `transform()`, `with()`, and `within()`.
Hadley has collected a few examples of these approaches online.

Using `$` directly is convenient when performing quick summaries.
For example, if you just want to find the size of the biggest diamond or the possible values of `cut`, there's no need to use `summarize()`:

```{r}
max(diamonds$carat)

levels(diamonds$cut)
```

dplyr also provides an equivalent to `[[`/`$` that we didn't mention in @sec-data-transform: `pull()`.
`pull()` takes either a variable name or variable position and returns just that column.
That means we could rewrite the above code to use the pipe:

```{r}
diamonds |> pull(carat) |> max()

diamonds |> pull(cut) |> levels()
```

### Tibbles

There are a couple of important differences between tibbles and base `data.frame`s when it comes to `$`.
+Data frames match the prefix of any variable names (so-called **partial matching**) and don't complain if a column doesn't exist:

```{r}
df <- data.frame(x1 = 1)
df$x
df$z
```

Tibbles are stricter: they only ever match variable names exactly and they will generate a warning if the column you are trying to access doesn't exist:

```{r}
tb <- tibble(x1 = 1)

tb$x
tb$z
```

For this reason, we sometimes joke that tibbles are lazy and surly: they do less and complain more.

### Lists

`[[` and `$` are also really important for working with lists, so it's worth understanding how they differ from `[`.
Let's illustrate the differences with a list named `l`:

```{r}
l <- list(
  a = 1:3,
  b = "a string",
  c = pi,
  d = list(-1, -5)
)
```

- `[` extracts a sub-list.
    No matter how many elements you extract, the result will always be a list.

    ```{r}
    str(l[1:2])

    str(l[1])

    str(l[4])
    ```

    Like with vectors, you can subset with a logical, integer, or character vector.

- `[[` and `$` extract a single component from a list.
    They remove a level of hierarchy from the list.

    ```{r}
    str(l[[1]])

    str(l[[4]])

    str(l$a)
    ```

The difference between `[` and `[[` is particularly important for lists because `[[` drills down into the list while `[` returns a new, smaller list.
To help you remember the difference, take a look at the unusual pepper shaker shown in @fig-pepper.
If this pepper shaker is your list `pepper`, then `pepper[1]` is a pepper shaker containing a single pepper packet.
`pepper[2]` would look the same, but would contain the second packet.
`pepper[1:2]` would be a pepper shaker containing two pepper packets.
`pepper[[1]]` would extract the pepper packet itself.

```{r}
#| label: fig-pepper
#| echo: false
#| out-width: "100%"
#| fig-cap: |
#|   (Left) A pepper shaker that Hadley once found in his hotel room.
#|   (Middle) `pepper[1]`.
#|   (Right) `pepper[[1]]`.
#| fig-alt: |
#|   Three photos. On the left is a photo of a glass pepper shaker. Instead of
#|   the pepper shaker containing pepper, it contains a single packet of pepper.
#|   In the middle is a photo of a single packet of pepper. On the right is a
#|   photo of the contents of a packet of pepper.

knitr::include_graphics("diagrams/pepper.png")
```

This same principle applies when you use 1d `[` with a data frame: `df["x"]` returns a one-column data frame and `df[["x"]]` returns a vector.

### Exercises

1. What happens when you use `[[` with a positive integer that's bigger than the length of the vector?
    What happens when you subset with a name that doesn't exist?

2. What would `pepper[[1]][1]` be?
    What about `pepper[[1]][[1]]`?

## Apply family

In @sec-iteration, you learned tidyverse techniques for iteration like `dplyr::across()` and the map family of functions.
In this section, you'll learn about their base equivalents, the **apply family**.
In this context, apply and map are synonyms, because another way of saying "map a function over each element of a vector" is "apply a function over each element of a vector".
Here we'll give you a quick overview of this family so you can recognize them in the wild.

The most important member of this family is `lapply()`, which is very similar to `purrr::map()`[^base-r-3].
In fact, because we haven't used any of `map()`'s more advanced features, you can replace every `map()` call in @sec-iteration with `lapply()`.
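For example, the two calls below return the same list of means, one per element; the only difference is which package provides the function.
(This is a minimal sketch on a made-up list, not an example from @sec-iteration.)

```{r}
x <- list(a = 1:5, b = 10:20)

# purrr
map(x, mean)

# base R equivalent
lapply(x, mean)
```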
+

[^base-r-3]: It just lacks convenient features like progress bars and reporting which element caused the problem if there's an error.

There's no exact base R equivalent to `across()`, but you can get close by using `[` with `lapply()`.
This works because under the hood, data frames are lists of columns, so calling `lapply()` on a data frame applies the function to each column.

```{r}
df <- tibble(a = 1, b = 2, c = "a", d = "b", e = 4)

# First find numeric columns
num_cols <- sapply(df, is.numeric)
num_cols

# Then transform those columns with lapply() and replace the original values
df[, num_cols] <- lapply(df[, num_cols, drop = FALSE], \(x) x * 2)
df
```

The code above uses a new function, `sapply()`.
It's similar to `lapply()` but it always tries to simplify the result, hence the `s` in its name, here producing a logical vector instead of a list.
We don't recommend using it for programming, because the simplification can fail and give you an unexpected type, but it's usually fine for interactive use.
purrr has a similar function called `map_vec()` that we didn't mention in @sec-iteration.

Base R provides a stricter version of `sapply()` called `vapply()`, short for **v**ector apply.
It takes an additional argument that specifies the expected type, ensuring that simplification occurs the same way regardless of the input.
For example, we could replace the `sapply()` call above with this `vapply()` where we specify that we expect `is.numeric()` to return a logical vector of length 1:

```{r}
vapply(df, is.numeric, logical(1))
```

The distinction between `sapply()` and `vapply()` is really important when they're inside a function (because it makes a big difference to the function's robustness to unusual inputs), but it doesn't usually matter in data analysis.

Another important member of the apply family is `tapply()`, which computes a single grouped summary:

```{r}
diamonds |>
  group_by(cut) |>
  summarize(price = mean(price))

tapply(diamonds$price, diamonds$cut, mean)
```

Unfortunately, `tapply()` returns its results in a named vector, which requires some gymnastics if you want to collect multiple summaries and grouping variables into a data frame (it's certainly possible to avoid this and just work with free-floating vectors, but in our experience that just delays the work).
If you want to see how you might use `tapply()` or other base techniques to perform other grouped summaries, Hadley has collected a few techniques [in a gist](https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec).

The final member of the apply family is the titular `apply()`, which works with matrices and arrays.
In particular, watch out for `apply(df, 2, something)`, which is a slow and potentially dangerous way of doing `lapply(df, something)`.
This rarely comes up in data science because we usually work with data frames and not matrices.

## `for` loops

`for` loops are the fundamental building block of iteration that both the apply and map families use under the hood.
`for` loops are powerful and general tools that are important to learn as you become a more experienced R programmer.
The basic structure of a `for` loop looks like this:

```{r}
#| eval: false

for (element in vector) {
  # do something with element
}
```

The most straightforward use of `for` loops is to achieve the same effect as `walk()`: call some function with a side-effect on each element of a list.
+For example, in @sec-save-database instead of using `walk()`:

```{r}
#| eval: false

paths |> walk(append_file)
```

We could have used a `for` loop:

```{r}
#| eval: false

for (path in paths) {
  append_file(path)
}
```

Things get a little trickier if you want to save the output of the `for` loop, for example reading all of the Excel files in a directory like we did in @sec-iteration:

```{r}
paths <- dir("data/gapminder", pattern = "\\.xlsx$", full.names = TRUE)
files <- map(paths, readxl::read_excel)
```

There are a few different techniques that you can use, but we recommend being explicit about what the output is going to look like upfront.
In this case, we're going to want a list the same length as `paths`, which we can create with `vector()`:

```{r}
files <- vector("list", length(paths))
```

Then instead of iterating over the elements of `paths`, we'll iterate over their indices, using `seq_along()` to generate one index for each element of `paths`:

```{r}
seq_along(paths)
```

Using the indices is important because it allows us to link each position in the input with the corresponding position in the output:

```{r}
for (i in seq_along(paths)) {
  files[[i]] <- readxl::read_excel(paths[[i]])
}
```

To combine the list of tibbles into a single tibble, you can use `do.call()` + `rbind()`:

```{r}
do.call(rbind, files)
```

Rather than making a list and saving the results as we go, a simpler approach is to build up the data frame piece-by-piece:

```{r}
out <- NULL
for (path in paths) {
  out <- rbind(out, readxl::read_excel(path))
}
```

We recommend avoiding this pattern because it can become very slow when the vector is very long.
This is the source of the persistent canard that `for` loops are slow: they're not, but iteratively growing a vector is.

## Plots

Many R users who don't otherwise use the tidyverse prefer ggplot2 for plotting due to helpful features like sensible defaults, automatic legends, and a modern look.
However, base R plotting functions can still be useful because they're so concise --- it takes very little typing to do a basic exploratory plot.

There are two main types of base plot you'll see in the wild: scatterplots and histograms, produced with `plot()` and `hist()` respectively.
Here's a quick example from the diamonds dataset:

```{r}
#| dev: png
#| fig-width: 4
#| fig-asp: 1
#| layout-ncol: 2
#| fig-alt: |
#|   On the left, histogram of carats of diamonds, ranging from 0 to 5 carats.
#|   The distribution is unimodal and right-skewed. On the right, scatter
#|   plot of price vs. carat of diamonds, showing a positive relationship
#|   that fans out as both price and carat increase. The scatter plot
#|   shows very few diamonds bigger than 3 carats compared to diamonds between
#|   0 and 3 carats.

# Left
hist(diamonds$carat)

# Right
plot(diamonds$carat, diamonds$price)
```

Note that base plotting functions work with vectors, so you need to pull columns out of the data frame using `$` or some other technique.

## Summary

In this chapter, we've shown you a selection of base R functions useful for subsetting and iteration.
Compared to approaches discussed elsewhere in the book, these functions tend to have more of a "vector" flavor than a "data frame" flavor because base R functions tend to take individual vectors, rather than a data frame and some column specification.
+This often makes life easier for programming and so becomes more important as you write more functions and begin to write your own packages. + +This chapter concludes the programming section of the book. +You've made a solid start on your journey to becoming not just a data scientist who uses R, but a data scientist who can *program* in R. +We hope these chapters have sparked your interest in programming and that you're looking forward to learning more outside of this book. diff --git a/communicate-plots.Rmd b/communicate-plots.Rmd deleted file mode 100644 index 8cd678471..000000000 --- a/communicate-plots.Rmd +++ /dev/null @@ -1,594 +0,0 @@ -# Graphics for communication - -## Introduction - -In [exploratory data analysis], you learned how to use plots as tools for _exploration_. When you make exploratory plots, you know---even before looking---which variables the plot will display. You made each plot for a purpose, could quickly look at it, and then move on to the next plot. In the course of most analyses, you'll produce tens or hundreds of plots, most of which are immediately thrown away. - -Now that you understand your data, you need to _communicate_ your understanding to others. Your audience will likely not share your background knowledge and will not be deeply invested in the data. To help others quickly build up a good mental model of the data, you will need to invest considerable effort in making your plots as self-explanatory as possible. In this chapter, you'll learn some of the tools that ggplot2 provides to do so. - -This chapter focuses on the tools you need to create good graphics. I assume that you know what you want, and just need to know how to do it. For that reason, I highly recommend pairing this chapter with a good general visualisation book. I particularly like [_The Truthful Art_](https://amzn.com/0321934075), by Albert Cairo. It doesn't teach the mechanics of creating visualisations, but instead focuses on what you need to think about in order to create effective graphics. - -### Prerequisites - -In this chapter, we'll focus once again on ggplot2. We'll also use a little dplyr for data manipulation, and a few ggplot2 extension packages, including __ggrepel__ and __viridis__. Rather than loading those extensions here, we'll refer to their functions explicitly, using the `::` notation. This will help make it clear which functions are built into ggplot2, and which come from other packages. Don't forget you'll need to install those packages with `install.packages()` if you don't already have them. - -```{r, message = FALSE} -library(tidyverse) -``` - -## Label - -The easiest place to start when turning an exploratory graphic into an expository graphic is with good labels. You add labels with the `labs()` function. This example adds a plot title: - -```{r, message = FALSE} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = class)) + - geom_smooth(se = FALSE) + - labs(title = "Fuel efficiency generally decreases with engine size") -``` - -The purpose of a plot title is to summarise the main finding. Avoid titles that just describe what the plot is, e.g. "A scatterplot of engine displacement vs. fuel economy". - -If you need to add more text, there are two other useful labels that you can use in ggplot2 2.2.0 and above (which should be available by the time you're reading this book): - -* `subtitle` adds additional detail in a smaller font beneath the title. - -* `caption` adds text at the bottom right of the plot, often used to describe - the source of the data. 
- -```{r, message = FALSE} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = class)) + - geom_smooth(se = FALSE) + - labs( - title = "Fuel efficiency generally decreases with engine size", - subtitle = "Two seaters (sports cars) are an exception because of their light weight", - caption = "Data from fueleconomy.gov" - ) -``` - -You can also use `labs()` to replace the axis and legend titles. It's usually a good idea to replace short variable names with more detailed descriptions, and to include the units. - -```{r, message = FALSE} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - geom_smooth(se = FALSE) + - labs( - x = "Engine displacement (L)", - y = "Highway fuel economy (mpg)", - colour = "Car type" - ) -``` - -It's possible to use mathematical equations instead of text strings. Just switch `""` out for `quote()` and read about the available options in `?plotmath`: - -```{r, fig.asp = 1, out.width = "50%", fig.width = 3} -df <- tibble( - x = runif(10), - y = runif(10) -) -ggplot(df, aes(x, y)) + - geom_point() + - labs( - x = quote(sum(x[i] ^ 2, i == 1, n)), - y = quote(alpha + beta + frac(delta, theta)) - ) -``` - -### Exercises - -1. Create one plot on the fuel economy data with customised `title`, - `subtitle`, `caption`, `x`, `y`, and `colour` labels. - -1. The `geom_smooth()` is somewhat misleading because the `hwy` for - large engines is skewed upwards due to the inclusion of lightweight - sports cars with big engines. Use your modelling tools to fit and display - a better model. - -1. Take an exploratory graphic that you've created in the last month, and add - informative titles to make it easier for others to understand. - -## Annotations - -In addition to labelling major components of your plot, it's often useful to label individual observations or groups of observations. The first tool you have at your disposal is `geom_text()`. `geom_text()` is similar to `geom_point()`, but it has an additional aesthetic: `label`. This makes it possible to add textual labels to your plots. - -There are two possible sources of labels. First, you might have a tibble that provides labels. The plot below isn't terribly useful, but it illustrates a useful approach: pull out the most efficient car in each class with dplyr, and then label it on the plot: - -```{r} -best_in_class <- mpg %>% - group_by(class) %>% - filter(row_number(desc(hwy)) == 1) - -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - geom_text(aes(label = model), data = best_in_class) -``` - -This is hard to read because the labels overlap with each other, and with the points. We can make things a little better by switching to `geom_label()` which draws a rectangle behind the text. We also use the `nudge_y` parameter to move the labels slightly above the corresponding points: - -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - geom_label(aes(label = model), data = best_in_class, nudge_y = 2, alpha = 0.5) -``` - -That helps a bit, but if you look closely in the top-left hand corner, you'll notice that there are two labels practically on top of each other. This happens because the highway mileage and displacement for the best cars in the compact and subcompact categories are exactly the same. There's no way that we can fix these by applying the same transformation for every label. Instead, we can use the __ggrepel__ package by Kamil Slowikowski. 
This useful package will automatically adjust labels so that they don't overlap: - -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - geom_point(size = 3, shape = 1, data = best_in_class) + - ggrepel::geom_label_repel(aes(label = model), data = best_in_class) -``` - -Note another handy technique used here: I added a second layer of large, hollow points to highlight the points that I've labelled. - -You can sometimes use the same idea to replace the legend with labels placed directly on the plot. It's not wonderful for this plot, but it isn't too bad. (`theme(legend.position = "none"`) turns the legend off --- we'll talk about it more shortly.) - -```{r} -class_avg <- mpg %>% - group_by(class) %>% - summarise( - displ = median(displ), - hwy = median(hwy) - ) - -ggplot(mpg, aes(displ, hwy, colour = class)) + - ggrepel::geom_label_repel(aes(label = class), - data = class_avg, - size = 6, - label.size = 0, - segment.color = NA - ) + - geom_point() + - theme(legend.position = "none") -``` - -Alternatively, you might just want to add a single label to the plot, but you'll still need to create a data frame. Often, you want the label in the corner of the plot, so it's convenient to create a new data frame using `summarise()` to compute the maximum values of x and y. - -```{r} -label <- mpg %>% - summarise( - displ = max(displ), - hwy = max(hwy), - label = "Increasing engine size is \nrelated to decreasing fuel economy." - ) - -ggplot(mpg, aes(displ, hwy)) + - geom_point() + - geom_text(aes(label = label), data = label, vjust = "top", hjust = "right") -``` - -If you want to place the text exactly on the borders of the plot, you can use `+Inf` and `-Inf`. Since we're no longer computing the positions from `mpg`, we can use `tibble()` to create the data frame: - -```{r} -label <- tibble( - displ = Inf, - hwy = Inf, - label = "Increasing engine size is \nrelated to decreasing fuel economy." -) - -ggplot(mpg, aes(displ, hwy)) + - geom_point() + - geom_text(aes(label = label), data = label, vjust = "top", hjust = "right") -``` - -In these examples, I manually broke the label up into lines using `"\n"`. Another approach is to use `stringr::str_wrap()` to automatically add line breaks, given the number of characters you want per line: - -```{r} -"Increasing engine size is related to decreasing fuel economy." %>% - stringr::str_wrap(width = 40) %>% - writeLines() -``` - -Note the use of `hjust` and `vjust` to control the alignment of the label. Figure \@ref(fig:just) shows all nine possible combinations. - -```{r just, echo = FALSE, fig.cap = "All nine combinations of `hjust` and `vjust`.", fig.asp = 0.5, fig.width = 4.5, out.width = "60%"} -vjust <- c(bottom = 0, center = 0.5, top = 1) -hjust <- c(left = 0, center = 0.5, right = 1) - -df <- tidyr::crossing(hj = names(hjust), vj = names(vjust)) %>% - mutate( - y = vjust[vj], - x = hjust[hj], - label = paste0("hjust = '", hj, "'\n", "vjust = '", vj, "'") - ) - -ggplot(df, aes(x, y)) + - geom_point(colour = "grey70", size = 5) + - geom_point(size = 0.5, colour = "red") + - geom_text(aes(label = label, hjust = hj, vjust = vj), size = 4) + - labs(x = NULL, y = NULL) -``` - -Remember, in addition to `geom_text()`, you have many other geoms in ggplot2 available to help annotate your plot. A few ideas: - -* Use `geom_hline()` and `geom_vline()` to add reference lines. I often make - them thick (`size = 2`) and white (`colour = white`), and draw them - underneath the primary data layer. 
That makes them easy to see, without - drawing attention away from the data. - -* Use `geom_rect()` to draw a rectangle around points of interest. The - boundaries of the rectangle are defined by aesthetics `xmin`, `xmax`, - `ymin`, `ymax`. - -* Use `geom_segment()` with the `arrow` argument to draw attention - to a point with an arrow. Use aesthetics `x` and `y` to define the - starting location, and `xend` and `yend` to define the end location. - -The only limit is your imagination (and your patience with positioning annotations to be aesthetically pleasing)! - -### Exercises - -1. Use `geom_text()` with infinite positions to place text at the - four corners of the plot. - -1. Read the documentation for `annotate()`. How can you use it to add a text - label to a plot without having to create a tibble? - -1. How do labels with `geom_text()` interact with faceting? How can you - add a label to a single facet? How can you put a different label in - each facet? (Hint: think about the underlying data.) - -1. What arguments to `geom_label()` control the appearance of the background - box? - -1. What are the four arguments to `arrow()`? How do they work? Create a series - of plots that demonstrate the most important options. - -## Scales - -The third way you can make your plot better for communication is to adjust the scales. Scales control the mapping from data values to things that you can perceive. Normally, ggplot2 automatically adds scales for you. For example, when you type: - -```{r default-scales, fig.show = "hide"} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) -``` - -ggplot2 automatically adds default scales behind the scenes: - -```{r, fig.show = "hide"} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - scale_x_continuous() + - scale_y_continuous() + - scale_colour_discrete() -``` - -Note the naming scheme for scales: `scale_` followed by the name of the aesthetic, then `_`, then the name of the scale. The default scales are named according to the type of variable they align with: continuous, discrete, datetime, or date. There are lots of non-default scales which you'll learn about below. - -The default scales have been carefully chosen to do a good job for a wide range of inputs. Nevertheless, you might want to override the defaults for two reasons: - -* You might want to tweak some of the parameters of the default scale. - This allows you to do things like change the breaks on the axes, or the - key labels on the legend. - -* You might want to replace the scale altogether, and use a completely - different algorithm. Often you can do better than the default because - you know more about the data. - -### Axis ticks and legend keys - -There are two primary arguments that affect the appearance of the ticks on the axes and the keys on the legend: `breaks` and `labels`. Breaks controls the position of the ticks, or the values associated with the keys. Labels controls the text label associated with each tick/key. The most common use of `breaks` is to override the default choice: - -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point() + - scale_y_continuous(breaks = seq(15, 40, by = 5)) -``` - -You can use `labels` in the same way (a character vector the same length as `breaks`), but you can also set it to `NULL` to suppress the labels altogether. This is useful for maps, or for publishing plots where you can't share the absolute numbers. 
- -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point() + - scale_x_continuous(labels = NULL) + - scale_y_continuous(labels = NULL) -``` - -You can also use `breaks` and `labels` to control the appearance of legends. Collectively axes and legends are called __guides__. Axes are used for x and y aesthetics; legends are used for everything else. - -Another use of `breaks` is when you have relatively few data points and want to highlight exactly where the observations occur. For example, take this plot that shows when each US president started and ended their term. - -```{r} -presidential %>% - mutate(id = 33 + row_number()) %>% - ggplot(aes(start, id)) + - geom_point() + - geom_segment(aes(xend = end, yend = id)) + - scale_x_date(NULL, breaks = presidential$start, date_labels = "'%y") -``` - -Note that the specification of breaks and labels for date and datetime scales is a little different: - -* `date_labels` takes a format specification, in the same form as - `parse_datetime()`. - -* `date_breaks` (not shown here), takes a string like "2 days" or "1 month". - -### Legend layout - -You will most often use `breaks` and `labels` to tweak the axes. While they both also work for legends, there are a few other techniques you are more likely to use. - -To control the overall position of the legend, you need to use a `theme()` setting. We'll come back to themes at the end of the chapter, but in brief, they control the non-data parts of the plot. The theme setting `legend.position` controls where the legend is drawn: - -```{r fig.asp = 1, fig.align = "default", out.width = "50%", fig.width = 4} -base <- ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) - -base + theme(legend.position = "left") -base + theme(legend.position = "top") -base + theme(legend.position = "bottom") -base + theme(legend.position = "right") # the default -``` - -You can also use `legend.position = "none"` to suppress the display of the legend altogether. - -To control the display of individual legends, use `guides()` along with `guide_legend()` or `guide_colourbar()`. The following example shows two important settings: controlling the number of rows the legend uses with `nrow`, and overriding one of the aesthetics to make the points bigger. This is particularly useful if you have used a low `alpha` to display many points on a plot. - -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(colour = class)) + - geom_smooth(se = FALSE) + - theme(legend.position = "bottom") + - guides(colour = guide_legend(nrow = 1, override.aes = list(size = 4))) -``` - -### Replacing a scale - -Instead of just tweaking the details a little, you can instead replace the scale altogether. There are two types of scales you're mostly likely to want to switch out: continuous position scales and colour scales. Fortunately, the same principles apply to all the other aesthetics, so once you've mastered position and colour, you'll be able to quickly pick up other scale replacements. - -It's very useful to plot transformations of your variable. For example, as we've seen in [diamond prices](diamond-prices) it's easier to see the precise relationship between `carat` and `price` if we log transform them: - -```{r, fig.align = "default", out.width = "50%"} -ggplot(diamonds, aes(carat, price)) + - geom_bin2d() - -ggplot(diamonds, aes(log10(carat), log10(price))) + - geom_bin2d() -``` - -However, the disadvantage of this transformation is that the axes are now labelled with the transformed values, making it hard to interpret the plot. 
Instead of doing the transformation in the aesthetic mapping, we can instead do it with the scale. This is visually identical, except the axes are labelled on the original data scale. - -```{r} -ggplot(diamonds, aes(carat, price)) + - geom_bin2d() + - scale_x_log10() + - scale_y_log10() -``` - -Another scale that is frequently customised is colour.The default categorical scale picks colours that are evenly spaced around the colour wheel. Useful alternatives are the ColorBrewer scales which have been hand tuned to work better for people with common types of colour blindness. The two plots below look similar, but there is enough difference in the shades of red and green that the dots on the right can be distinguished even by people with red-green colour blindness. - -```{r, fig.align = "default", out.width = "50%"} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = drv)) - -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = drv)) + - scale_colour_brewer(palette = "Set1") -``` - -Don't forget simpler techniques. If there are just a few colours, you can add a redundant shape mapping. This will also help ensure your plot is interpretable in black and white. - -```{r} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = drv, shape = drv)) + - scale_colour_brewer(palette = "Set1") -``` - -The ColorBrewer scales are documented online at and made available in R via the __RColorBrewer__ package, by Erich Neuwirth. Figure \@ref(fig:brewer) shows the complete list of all palettes. The sequential (top) and diverging (bottom) palettes are particularly useful if your categorical values are ordered, or have a "middle". This often arises if you've used `cut()` to make a continuous variable into a categorical variable. - -```{r brewer, fig.asp = 2.5, echo = FALSE, fig.cap = "All ColourBrewer scales."} -par(mar = c(0, 3, 0, 0)) -RColorBrewer::display.brewer.all() -``` - -When you have a predefined mapping between values and colours, use `scale_colour_manual()`. For example, if we map presidential party to colour, we want to use the standard mapping of red for Republicans and blue for Democrats: - -```{r} -presidential %>% - mutate(id = 33 + row_number()) %>% - ggplot(aes(start, id, colour = party)) + - geom_point() + - geom_segment(aes(xend = end, yend = id)) + - scale_colour_manual(values = c(Republican = "red", Democratic = "blue")) -``` - -For continuous colour, you can use the built-in `scale_colour_gradient()` or `scale_fill_gradient()`. If you have a diverging scale, you can use `scale_colour_gradient2()`. That allows you to give, for example, positive and negative values different colours. That's sometimes also useful if you want to distinguish points above or below the mean. - -Another option is `scale_colour_viridis()` provided by the __viridis__ package. It's a continuous analog of the categorical ColorBrewer scales. The designers, Nathaniel Smith and Stéfan van der Walt, carefully tailored a continuous colour scheme that has good perceptual properties. Here's an example from the viridis vignette. 
- -```{r, fig.align = "default", fig.asp = 1, out.width = "50%", fig.width = 4} -df <- tibble( - x = rnorm(10000), - y = rnorm(10000) -) -ggplot(df, aes(x, y)) + - geom_hex() + - coord_fixed() - -ggplot(df, aes(x, y)) + - geom_hex() + - viridis::scale_fill_viridis() + - coord_fixed() -``` - -Note that all colour scales come in two variety: `scale_colour_x()` and `scale_fill_x()` for the `colour` and `fill` aesthetics respectively (the colour scales are available in both UK and US spellings). - -### Exercises - -1. Why doesn't the following code override the default scale? - - ```{r fig.show = "hide"} - ggplot(df, aes(x, y)) + - geom_hex() + - scale_colour_gradient(low = "white", high = "red") + - coord_fixed() - ``` - -1. What is the first argument to every scale? How does it compare to `labs()`? - -1. Change the display of the presidential terms by: - - 1. Combining the two variants shown above. - 1. Improving the display of the y axis. - 1. Labelling each term with the name of the president. - 1. Adding informative plot labels. - 1. Placing breaks every 4 years (this is trickier than it seems!). - -1. Use `override.aes` to make the legend on the following plot easier to see. - - ```{r, dev = "png", out.width = "50%"} - ggplot(diamonds, aes(carat, price)) + - geom_point(aes(colour = cut), alpha = 1/20) - ``` - -## Zooming - -There are three ways to control the plot limits: - -1. Adjusting what data are plotted -1. Setting the limits in each scale -1. Setting `xlim` and `ylim` in `coord_cartesian()` - -To zoom in on a region of the plot, it's generally best to use `coord_cartesian()`. Compare the following two plots: - -```{r out.width = "50%", fig.align = "default", message = FALSE} -ggplot(mpg, mapping = aes(displ, hwy)) + - geom_point(aes(color = class)) + - geom_smooth() + - coord_cartesian(xlim = c(5, 7), ylim = c(10, 30)) - -mpg %>% - filter(displ >= 5, displ <= 7, hwy >= 10, hwy <= 30) %>% - ggplot(aes(displ, hwy)) + - geom_point(aes(color = class)) + - geom_smooth() -``` - -You can also set the `limits` on individual scales. Reducing the limits is basically equivalent to subsetting the data. It is generally more useful if you want _expand_ the limits, for example, to match scales across different plots. For example, if we extract two classes of cars and plot them separately, it's difficult to compare the plots because all three scales (the x-axis, the y-axis, and the colour aesthetic) have different ranges. - -```{r out.width = "50%", fig.align = "default", fig.width = 4} -suv <- mpg %>% filter(class == "suv") -compact <- mpg %>% filter(class == "compact") - -ggplot(suv, aes(displ, hwy, colour = drv)) + - geom_point() - -ggplot(compact, aes(displ, hwy, colour = drv)) + - geom_point() -``` - -One way to overcome this problem is to share scales across multiple plots, training the scales with the `limits` of the full data. - -```{r out.width = "50%", fig.align = "default", fig.width = 4} -x_scale <- scale_x_continuous(limits = range(mpg$displ)) -y_scale <- scale_y_continuous(limits = range(mpg$hwy)) -col_scale <- scale_colour_discrete(limits = unique(mpg$drv)) - -ggplot(suv, aes(displ, hwy, colour = drv)) + - geom_point() + - x_scale + - y_scale + - col_scale - -ggplot(compact, aes(displ, hwy, colour = drv)) + - geom_point() + - x_scale + - y_scale + - col_scale -``` - -In this particular case, you could have simply used faceting, but this technique is useful more generally, if for instance, you want spread plots over multiple pages of a report. 
- -## Themes - -Finally, you can customise the non-data elements of your plot with a theme: - -```{r, message = FALSE} -ggplot(mpg, aes(displ, hwy)) + - geom_point(aes(color = class)) + - geom_smooth(se = FALSE) + - theme_bw() -``` - -ggplot2 includes eight themes by default, as shown in Figure \@ref(fig:themes). Many more are included in add-on packages like __ggthemes__ (), by Jeffrey Arnold. - -```{r themes, echo = FALSE, fig.cap = "The eight themes built-in to ggplot2."} -knitr::include_graphics("images/visualization-themes.png") -``` - -Many people wonder why the default theme has a grey background. This was a deliberate choice because it puts the data forward while still making the grid lines visible. The white grid lines are visible (which is important because they significantly aid position judgements), but they have little visual impact and we can easily tune them out. The grey background gives the plot a similar typographic colour to the text, ensuring that the graphics fit in with the flow of a document without jumping out with a bright white background. Finally, the grey background creates a continuous field of colour which ensures that the plot is perceived as a single visual entity. - -It's also possible to control individual components of each theme, like the size and colour of the font used for the y axis. Unfortunately, this level of detail is outside the scope of this book, so you'll need to read the [ggplot2 book](https://amzn.com/331924275X) for the full details. You can also create your own themes, if you are trying to match a particular corporate or journal style. - -## Saving your plots - -There are two main ways to get your plots out of R and into your final write-up: `ggsave()` and knitr. `ggsave()` will save the most recent plot to disk: - -```{r, fig.show = "none"} -ggplot(mpg, aes(displ, hwy)) + geom_point() -ggsave("my-plot.pdf") -``` -```{r, include = FALSE} -file.remove("my-plot.pdf") -``` - -If you don't specify the `width` and `height` they will be taken from the dimensions of the current plotting device. For reproducible code, you'll want to specify them. - -Generally, however, I think you should be assembling your final reports using R Markdown, so I want to focus on the important code chunk options that you should know about for graphics. You can learn more about `ggsave()` in the documentation. - -### Figure sizing - -The biggest challenge of graphics in R Markdown is getting your figures the right size and shape. There are five main options that control figure sizing: `fig.width`, `fig.height`, `fig.asp`, `out.width` and `out.height`. Image sizing is challenging because there are two sizes (the size of the figure created by R and the size at which it is inserted in the output document), and multiple ways of specifying the size (i.e., height, width, and aspect ratio: pick two of three). - -I only ever use three of the five options: - -* I find it most aesthetically pleasing for plots to have a consistent - width. To enforce this, I set `fig.width = 6` (6") and `fig.asp = 0.618` - (the golden ratio) in the defaults. Then in individual chunks, I only - adjust `fig.asp`. - -* I control the output size with `out.width` and set it to a percentage - of the line width). I default to `out.width = "70%"` - and `fig.align = "center"`. That give plots room to breathe, without taking - up too much space. 
- -* To put multiple plots in a single row I set the `out.width` to - `50%` for two plots, `33%` for 3 plots, or `25%` to 4 plots, and set - `fig.align = "default"`. Depending on what I'm trying to illustrate (e.g. - show data or show plot variations), I'll also tweak `fig.width`, as - discussed below. - -If you find that you're having to squint to read the text in your plot, you need to tweak `fig.width`. If `fig.width` is larger than the size the figure is rendered in the final doc, the text will be too small; if `fig.width` is smaller, the text will be too big. You'll often need to do a little experimentation to figure out the right ratio between the `fig.width` and the eventual width in your document. To illustrate the principle, the following three plots have `fig.width` of 4, 6, and 8 respectively: - -```{r, include = FALSE} -plot <- ggplot(mpg, aes(displ, hwy)) + geom_point() -``` -```{r, fig.width = 4, echo = FALSE} -plot -``` -```{r, fig.width = 6, echo = FALSE} -plot -``` -```{r, fig.width = 8, echo = FALSE} -plot -``` - -If you want to make sure the font size is consistent across all your figures, whenever you set `out.width`, you'll also need to adjust `fig.width` to maintain the same ratio with your default `out.width`. For example, if your default `fig.width` is 6 and `out.width` is 0.7, when you set `out.width = "50%"` you'll need to set `fig.width` to 4.3 (6 * 0.5 / 0.7). - -### Other important options - -When mingling code and text, like I do in this book, I recommend setting `fig.show = "hold"` so that plots are shown after the code. This has the pleasant side effect of forcing you to break up large blocks of code with their explanations. - -To add a caption to the plot, use `fig.cap`. In R Markdown this will change the figure from inline to "floating". - -If you're producing PDF output, the default graphics type is PDF. This is a good default because PDFs are high quality vector graphics. However, they can produce very large and slow plots if you are displaying thousands of points. In that case, set `dev = "png"` to force the use of PNGs. They are slightly lower quality, but will be much more compact. - -It's a good idea to name code chunks that produce figures, even if you don't routinely label other chunks. The chunk label is used to generate the file name of the graphic on disk, so naming your chunks makes it much easier to pick out plots and reuse in other circumstances (i.e. if you want to quickly drop a single plot into an email or a tweet). - -## Learning more - -The absolute best place to learn more is the ggplot2 book: [_ggplot2: Elegant graphics for data analysis_](https://amzn.com/331924275X). It goes into much more depth about the underlying theory, and has many more examples of how to combine the individual pieces to solve practical problems. Unfortunately, the book is not available online for free, although you can find the source code at . - -Another great resource is the ggplot2 extensions guide . This site lists many of the packages that extend ggplot2 with new geoms and scales. It's a great place to start if you're trying to do something that seems hard with ggplot2. 
diff --git a/communicate.Rmd b/communicate.Rmd deleted file mode 100644 index c4d9cccc5..000000000 --- a/communicate.Rmd +++ /dev/null @@ -1,32 +0,0 @@ -# (PART) Communicate {-}
-
-# Introduction {#communicate-intro}
-
-So far, you've learned the tools to get your data into R, tidy it into a form convenient for analysis, and then understand your data through transformation, visualisation and modelling. However, it doesn't matter how great your analysis is unless you can explain it to others: you need to __communicate__ your results.
-
-```{r echo = FALSE, out.width = "75%"}
-knitr::include_graphics("diagrams/data-science-communicate.png")
-```
-
-Communication is the theme of the following four chapters:
-
-* In [R Markdown], you will learn about R Markdown, a tool for integrating
-  prose, code, and results. You can use R Markdown in notebook mode for
-  analyst-to-analyst communication, and in report mode for
-  analyst-to-decision-maker communication. Thanks to the power of R Markdown
-  formats, you can even use the same document for both purposes.
-
-* In [Graphics for communication], you will learn how to take your exploratory
-  graphics and turn them into expository graphics, graphics that help the
-  newcomer to your analysis understand what's going on as quickly and
-  easily as possible.
-
-* In [R Markdown formats], you'll learn a little about the many other varieties
-  of outputs you can produce using R Markdown, including dashboards, websites,
-  and books.
-
-* We'll finish up with [R Markdown workflow], where you'll learn about the
-  "analysis notebook" and how to systematically record your successes and
-  failures so that you can learn from them.
-
-Unfortunately, these chapters focus mostly on the technical mechanics of communication, not the really hard problems of communicating your thoughts to other humans. However, there are lot of other great books about communication, which we'll point you to at the end of each chapter. diff --git a/communicate.qmd b/communicate.qmd new file mode 100644 index 000000000..073efc6da --- /dev/null +++ b/communicate.qmd @@ -0,0 +1,36 @@ +# Communicate {#sec-communicate-intro .unnumbered}

```{r}
#| echo: false

source("_common.R")
```

So far, you've learned the tools to get your data into R, tidy it into a form convenient for analysis, and then understand your data through transformation and visualization.
However, it doesn't matter how great your analysis is unless you can explain it to others: you need to **communicate** your results.

```{r}
#| label: fig-ds-communicate
#| echo: false
#| fig-cap: |
#|   Communication is the final part of the data science process; if you
#|   can't communicate your results to other humans, it doesn't matter how
#|   great your analysis is.
#| fig-alt: |
#|   A diagram displaying the data science cycle with
#|   communicate highlighted in blue.
#| out.width: NULL

knitr::include_graphics("diagrams/data-science/communicate.png", dpi = 270)
```

Communication is the theme of the following two chapters:

- In @sec-quarto, you will learn about Quarto, a tool for integrating prose, code, and results.
    You can use Quarto for analyst-to-analyst communication as well as analyst-to-decision-maker communication.
    Thanks to the power of Quarto formats, you can even use the same document for both purposes.

- In @sec-quarto-formats, you'll learn a little about the many other varieties of outputs you can produce using Quarto, including dashboards, websites, and books.
+

These chapters focus mostly on the technical mechanics of communication, not the really hard problems of communicating your thoughts to other humans.
However, there are a lot of other great books about communication, which we'll point you to at the end of each chapter. diff --git a/communication.qmd b/communication.qmd new file mode 100644 index 000000000..af2ad1a1e --- /dev/null +++ b/communication.qmd @@ -0,0 +1,1090 @@ +# Communication {#sec-communication}

```{r}
#| echo: false

source("_common.R")
```

## Introduction

In @sec-exploratory-data-analysis, you learned how to use plots as tools for *exploration*.
When you make exploratory plots, you know---even before looking---which variables the plot will display.
You made each plot for a purpose, could quickly look at it, and then move on to the next plot.
In the course of most analyses, you'll produce tens or hundreds of plots, most of which are immediately thrown away.

Now that you understand your data, you need to *communicate* your understanding to others.
Your audience will likely not share your background knowledge and will not be deeply invested in the data.
To help others quickly build up a good mental model of the data, you will need to invest considerable effort in making your plots as self-explanatory as possible.
In this chapter, you'll learn some of the tools that ggplot2 provides to do so.

This chapter focuses on the tools you need to create good graphics.
We assume that you know what you want, and just need to know how to do it.
For that reason, we highly recommend pairing this chapter with a good general visualization book.
We particularly like [The Truthful Art](https://www.amazon.com/gp/product/0321934075/), by Albert Cairo.
It doesn't teach the mechanics of creating visualizations, but instead focuses on what you need to think about in order to create effective graphics.

### Prerequisites

In this chapter, we'll focus once again on ggplot2.
We'll also use a little dplyr for data manipulation, **scales** to override the default breaks, labels, transformations, and palettes, and a few ggplot2 extension packages, including **ggrepel** ([https://ggrepel.slowkow.com](https://ggrepel.slowkow.com/)) by Kamil Slowikowski and **patchwork** ([https://patchwork.data-imaginist.com](https://patchwork.data-imaginist.com/)) by Thomas Lin Pedersen.
Don't forget that you'll need to install those packages with `install.packages()` if you don't already have them.

```{r}
#| label: setup
#| message: false

library(tidyverse)
library(scales)
library(ggrepel)
library(patchwork)
```

## Labels

The easiest place to start when turning an exploratory graphic into an expository graphic is with good labels.
You add labels with the `labs()` function.

```{r}
#| message: false
#| fig-alt: |
#|   Scatterplot of highway fuel efficiency versus engine size of cars, where
#|   points are colored according to the car class. A smooth curve following
#|   the relationship between highway fuel efficiency and engine size of cars
#|   is overlaid. The x-axis is labelled "Engine displacement (L)" and the
#|   y-axis is labelled "Highway fuel economy (mpg)". The legend is labelled
#|   "Car type". The plot is titled "Fuel efficiency generally decreases with
#|   engine size". The subtitle is "Two seaters (sports cars) are an exception
#|   because of their light weight" and the caption is "Data from
#|   fueleconomy.gov".
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class)) +
+  geom_smooth(se = FALSE) +
+  labs(
+    x = "Engine displacement (L)",
+    y = "Highway fuel economy (mpg)",
+    color = "Car type",
+    title = "Fuel efficiency generally decreases with engine size",
+    subtitle = "Two seaters (sports cars) are an exception because of their light weight",
+    caption = "Data from fueleconomy.gov"
+  )
+```
+
+The purpose of a plot title is to summarize the main finding.
+Avoid titles that just describe what the plot is, e.g., "A scatterplot of engine displacement vs. fuel economy".
+
+If you need to add more text, there are two other useful labels: `subtitle` adds additional detail in a smaller font beneath the title and `caption` adds text at the bottom right of the plot, often used to describe the source of the data.
+You can also use `labs()` to replace the axis and legend titles.
+It's usually a good idea to replace short variable names with more detailed descriptions, and to include the units.
+
+It's possible to use mathematical equations instead of text strings.
+Just switch `""` out for `quote()` and read about the available options in `?plotmath`:
+
+```{r}
+#| fig-asp: 1
+#| out-width: "50%"
+#| fig-width: 3
+#| fig-alt: |
+#|   Scatterplot with math text on the x and y axis labels. X-axis label
+#|   says x_i, y-axis label says sum of x_i squared, for i from 1 to n.
+
+df <- tibble(
+  x = 1:10,
+  y = cumsum(x^2)
+)
+
+ggplot(df, aes(x, y)) +
+  geom_point() +
+  labs(
+    x = quote(x[i]),
+    y = quote(sum(x[i] ^ 2, i == 1, n))
+  )
+```
+
+### Exercises
+
+1. Create one plot on the fuel economy data with customized `title`, `subtitle`, `caption`, `x`, `y`, and `color` labels.
+
+2. Recreate the following plot using the fuel economy data.
+   Note that both the colors and shapes of points vary by type of drive train.
+
+    ```{r}
+    #| echo: false
+    #| fig-alt: |
+    #|   Scatterplot of highway versus city fuel efficiency. Shapes and
+    #|   colors of points are determined by type of drive train.
+
+    ggplot(mpg, aes(x = cty, y = hwy, color = drv, shape = drv)) +
+      geom_point() +
+      labs(
+        x = "City MPG",
+        y = "Highway MPG",
+        shape = "Type of\ndrive train",
+        color = "Type of\ndrive train"
+      )
+    ```
+
+3. Take an exploratory graphic that you've created in the last month, and add informative titles to make it easier for others to understand.
+
+## Annotations
+
+In addition to labelling major components of your plot, it's often useful to label individual observations or groups of observations.
+The first tool you have at your disposal is `geom_text()`.
+`geom_text()` is similar to `geom_point()`, but it has an additional aesthetic: `label`.
+This makes it possible to add textual labels to your plots.
+
+There are two possible sources of labels.
+First, you might have a tibble that provides labels.
+In the following plot we pull out the cars with the highest engine size in each drive type and save their information as a new data frame called `label_info`.
+
+```{r}
+label_info <- mpg |>
+  group_by(drv) |>
+  arrange(desc(displ)) |>
+  slice_head(n = 1) |>
+  mutate(
+    drive_type = case_when(
+      drv == "f" ~ "front-wheel drive",
+      drv == "r" ~ "rear-wheel drive",
+      drv == "4" ~ "4-wheel drive"
+    )
+  ) |>
+  select(displ, hwy, drv, drive_type)
+
+label_info
+```
+
+Then, we use this new data frame to label the three groups, replacing the legend with labels placed directly on the plot.
+Using the `fontface` and `size` arguments we can customize the look of the text labels.
+They're larger than the rest of the text on the plot and bolded.
+(`theme(legend.position = "none")` turns all the legends off --- we'll talk about it more shortly.)
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway mileage versus engine size where points are colored
+#|   by drive type. Smooth curves for each drive type are overlaid.
+#|   Text labels identify the curves as front-wheel, rear-wheel, and 4-wheel.
+
+ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
+  geom_point(alpha = 0.3) +
+  geom_smooth(se = FALSE) +
+  geom_text(
+    data = label_info,
+    aes(x = displ, y = hwy, label = drive_type),
+    fontface = "bold", size = 5, hjust = "right", vjust = "bottom"
+  ) +
+  theme(legend.position = "none")
+```
+
+Note the use of `hjust` (horizontal justification) and `vjust` (vertical justification) to control the alignment of the label.
+
+However, the annotated plot we made above is hard to read because the labels overlap with each other, and with the points.
+We can use the `geom_label_repel()` function from the ggrepel package to address both of these issues.
+This useful package will automatically adjust labels so that they don't overlap:
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars, where
+#|   points are colored according to the drive type. The drive types are
+#|   labelled directly on the plot. The labels are boxed, with a white,
+#|   transparent background, and positioned so they don't overlap.
+
+ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
+  geom_point(alpha = 0.3) +
+  geom_smooth(se = FALSE) +
+  geom_label_repel(
+    data = label_info,
+    aes(x = displ, y = hwy, label = drive_type),
+    fontface = "bold", size = 5, nudge_y = 2
+  ) +
+  theme(legend.position = "none")
+```
+
+You can also use the same idea to highlight certain points on a plot with `geom_text_repel()` from the ggrepel package.
+Note another handy technique used here: we added a second layer of large, hollow points to further highlight the labelled points.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars. Points
+#|   where highway mileage is above 40 as well as above 20 with engine size
+#|   above 5 are red, with a hollow red circle, and labelled with model name
+#|   of the car.
+
+potential_outliers <- mpg |>
+  filter(hwy > 40 | (hwy > 20 & displ > 5))
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point() +
+  geom_text_repel(data = potential_outliers, aes(label = model)) +
+  geom_point(data = potential_outliers, color = "red") +
+  geom_point(
+    data = potential_outliers,
+    color = "red", size = 3, shape = "circle open"
+  )
+```
+
+Remember, in addition to `geom_text()` and `geom_label()`, you have many other geoms in ggplot2 available to help annotate your plot.
+A couple of ideas:
+
+- Use `geom_hline()` and `geom_vline()` to add reference lines.
+  We often make them thick (`linewidth = 2`) and white (`color = "white"`), and draw them underneath the primary data layer.
+  That makes them easy to see, without drawing attention away from the data.
+
+- Use `geom_rect()` to draw a rectangle around points of interest.
+  The boundaries of the rectangle are defined by aesthetics `xmin`, `xmax`, `ymin`, `ymax`.
+  Alternatively, look into the [ggforce package](https://ggforce.data-imaginist.com/index.html), specifically [`geom_mark_hull()`](https://ggforce.data-imaginist.com/reference/geom_mark_hull.html), which allows you to annotate subsets of points with hulls.
+
+- Use `geom_segment()` with the `arrow` argument to draw attention to a point with an arrow.
+  Use aesthetics `x` and `y` to define the starting location, and `xend` and `yend` to define the end location.
+
+Another handy function for adding annotations to plots is `annotate()`.
+As a rule of thumb, geoms are generally useful for highlighting a subset of the data while `annotate()` is useful for adding one or a few annotation elements to a plot.
+
+To demonstrate using `annotate()`, let's create some text to add to our plot.
+The text is a bit long, so we'll use `stringr::str_wrap()` to automatically add line breaks to it given the number of characters you want per line:
+
+```{r}
+trend_text <- "Larger engine sizes tend to have lower fuel economy." |>
+  str_wrap(width = 30)
+trend_text
+```
+
+Then, we add two layers of annotation: one with a label geom and the other with a segment geom.
+The `x` and `y` aesthetics in both define where the annotation should start, and the `xend` and `yend` aesthetics in the segment annotation define the end location of the segment.
+Note also that the segment is styled as an arrow.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars. A red
+#|   arrow pointing down follows the trend of the points and the annotation
+#|   placed next to the arrow reads "Larger engine sizes tend to have lower
+#|   fuel economy". The arrow and the annotation text is red.
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point() +
+  annotate(
+    geom = "label", x = 3.5, y = 38,
+    label = trend_text,
+    hjust = "left", color = "red"
+  ) +
+  annotate(
+    geom = "segment",
+    x = 3, y = 35, xend = 5, yend = 25, color = "red",
+    arrow = arrow(type = "closed")
+  )
+```
+
+Annotation is a powerful tool for communicating main takeaways and interesting features of your visualizations.
+The only limit is your imagination (and your patience with positioning annotations to be aesthetically pleasing)!
+
+### Exercises
+
+1. Use `geom_text()` with infinite positions to place text at the four corners of the plot.
+
+2. Use `annotate()` to add a point geom in the middle of your last plot without having to create a tibble.
+   Customize the shape, size, or color of the point.
+
+3. How do labels with `geom_text()` interact with faceting?
+   How can you add a label to a single facet?
+   How can you put a different label in each facet?
+   (Hint: Think about the dataset that is being passed to `geom_text()`.)
+
+4. What arguments to `geom_label()` control the appearance of the background box?
+
+5. What are the four arguments to `arrow()`?
+   How do they work?
+   Create a series of plots that demonstrate the most important options.
+
+## Scales
+
+The third way you can make your plot better for communication is to adjust the scales.
+Scales control how the aesthetic mappings manifest visually.
+
+### Default scales
+
+Normally, ggplot2 automatically adds scales for you.
+For example, when you type:
+
+```{r}
+#| label: default-scales
+#| fig-show: "hide"
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class))
+```
+
+ggplot2 automatically adds default scales behind the scenes:
+
+```{r}
+#| fig-show: "hide"
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class)) +
+  scale_x_continuous() +
+  scale_y_continuous() +
+  scale_color_discrete()
+```
+
+Note the naming scheme for scales: `scale_` followed by the name of the aesthetic, then `_`, then the name of the scale.
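+
+For example, all of the following are existing scale functions whose names follow that pattern (shown unevaluated, purely to illustrate the scheme):
+
+```{r}
+#| eval: false
+
+scale_x_continuous()    # "x" aesthetic, "continuous" scale
+scale_color_discrete()  # "color" aesthetic, "discrete" scale
+scale_x_log10()         # "x" aesthetic, "log10" transformation scale
+scale_fill_viridis_c()  # "fill" aesthetic, continuous viridis scale
+```
+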
+The default scales are named according to the type of variable they align with: continuous, discrete, datetime, or date.
+`scale_x_continuous()` puts the numeric values from `displ` on a continuous number line on the x-axis, `scale_color_discrete()` chooses colors for each `class` of car, etc.
+There are lots of non-default scales which you'll learn about below.
+
+The default scales have been carefully chosen to do a good job for a wide range of inputs.
+Nevertheless, you might want to override the defaults for two reasons:
+
+- You might want to tweak some of the parameters of the default scale.
+  This allows you to do things like change the breaks on the axes, or the key labels on the legend.
+
+- You might want to replace the scale altogether, and use a completely different algorithm.
+  Often you can do better than the default because you know more about the data.
+
+### Axis ticks and legend keys
+
+Collectively, axes and legends are called **guides**.
+Axes are used for x and y aesthetics; legends are used for everything else.
+
+There are two primary arguments that affect the appearance of the ticks on the axes and the keys on the legend: `breaks` and `labels`.
+`breaks` controls the position of the ticks, or the values associated with the keys.
+`labels` controls the text label associated with each tick/key.
+The most common use of `breaks` is to override the default choice:
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars,
+#|   colored by drive. The y-axis has breaks starting at 15 and ending at 40,
+#|   increasing by 5.
+
+ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
+  geom_point() +
+  scale_y_continuous(breaks = seq(15, 40, by = 5))
+```
+
+You can use `labels` in the same way (a character vector the same length as `breaks`), but you can also set it to `NULL` to suppress the labels altogether.
+This can be useful for maps, or for publishing plots where you can't share the absolute numbers.
+You can also use `breaks` and `labels` to control the appearance of legends.
+For discrete scales for categorical variables, `labels` can be a named list of the existing level names and the desired labels for them.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars, colored
+#|   by drive. The x and y-axes do not have any labels at the axis ticks.
+#|   The legend has custom labels: 4-wheel, front, rear.
+
+ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
+  geom_point() +
+  scale_x_continuous(labels = NULL) +
+  scale_y_continuous(labels = NULL) +
+  scale_color_discrete(labels = c("4" = "4-wheel", "f" = "front", "r" = "rear"))
+```
+
+The `labels` argument coupled with labelling functions from the scales package is also useful for formatting numbers as currency, percent, etc.
+The plot on the left shows default labelling with `label_dollar()`, which adds a dollar sign as well as a thousand separator comma.
+The plot on the right adds further customization by dividing dollar values by 1,000 and adding a suffix "K" (for "thousands") as well as adding custom breaks.
+Note that `breaks` is in the original scale of the data.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| fig-alt: |
+#|   Two side-by-side box plots of price versus cut of diamonds. The outliers
+#|   are transparent. On both plots the x-axis labels are formatted as dollars.
+#|   The x-axis labels on the left plot start at $0 and go to $15,000,
+#|   increasing by $5,000.
+#|   The x-axis labels on the right plot start at $1K and go to $19K,
+#|   increasing by $6K.
+
+# Left
+ggplot(diamonds, aes(x = price, y = cut)) +
+  geom_boxplot(alpha = 0.05) +
+  scale_x_continuous(labels = label_dollar())
+
+# Right
+ggplot(diamonds, aes(x = price, y = cut)) +
+  geom_boxplot(alpha = 0.05) +
+  scale_x_continuous(
+    labels = label_dollar(scale = 1/1000, suffix = "K"),
+    breaks = seq(1000, 19000, by = 6000)
+  )
+```
+
+Another handy label function is `label_percent()`:
+
+```{r}
+#| fig-alt: |
+#|   Segmented bar plots of cut, filled with levels of clarity. The y-axis
+#|   labels start at 0% and go to 100%, increasing by 25%. The y-axis label
+#|   name is "Percentage".
+
+ggplot(diamonds, aes(x = cut, fill = clarity)) +
+  geom_bar(position = "fill") +
+  scale_y_continuous(name = "Percentage", labels = label_percent())
+```
+
+Another use of `breaks` is when you have relatively few data points and want to highlight exactly where the observations occur.
+For example, take this plot that shows when each US president started and ended their term.
+
+```{r}
+#| fig-alt: |
+#|   Line plot of id number of presidents versus the year they started their
+#|   presidency. Start year is marked with a point and a segment that starts
+#|   there and ends at the end of the presidency. The x-axis labels are
+#|   formatted as two digit years starting with an apostrophe, e.g., '53.
+
+presidential |>
+  mutate(id = 33 + row_number()) |>
+  ggplot(aes(x = start, y = id)) +
+  geom_point() +
+  geom_segment(aes(xend = end, yend = id)) +
+  scale_x_date(name = NULL, breaks = presidential$start, date_labels = "'%y")
+```
+
+Note that for the `breaks` argument we pulled out the `start` variable as a vector with `presidential$start` because we can't do an aesthetic mapping for this argument.
+Also note that the specification of breaks and labels for date and datetime scales is a little different:
+
+- `date_labels` takes a format specification, in the same form as `parse_datetime()`.
+
+- `date_breaks` (not shown here) takes a string like "2 days" or "1 month".
+
+### Legend layout
+
+You will most often use `breaks` and `labels` to tweak the axes.
+While they both also work for legends, there are a few other techniques you are more likely to use.
+
+To control the overall position of the legend, you need to use a `theme()` setting.
+We'll come back to themes at the end of the chapter, but in brief, they control the non-data parts of the plot.
+The theme setting `legend.position` controls where the legend is drawn:
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| fig-alt: |
+#|   Four scatterplots of highway fuel efficiency versus engine size of cars
+#|   where points are colored based on class of car. Clockwise, the legend
+#|   is placed on the right, left, top, and bottom of the plot.
+
+base <- ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class))
+
+base + theme(legend.position = "right") # the default
+base + theme(legend.position = "left")
+base +
+  theme(legend.position = "top") +
+  guides(color = guide_legend(nrow = 3))
+base +
+  theme(legend.position = "bottom") +
+  guides(color = guide_legend(nrow = 3))
+```
+
+If your plot is short and wide, place the legend at the top or bottom, and if it's tall and narrow, place the legend at the left or right.
+You can also use `legend.position = "none"` to suppress the display of the legend altogether.
+
+To control the display of individual legends, use `guides()` along with `guide_legend()` or `guide_colorbar()`.
+The following example shows two important settings: controlling the number of rows the legend uses with `nrow`, and overriding one of the aesthetics to make the points bigger.
+This is particularly useful if you have used a low `alpha` to display many points on a plot.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars
+#|   where points are colored based on class of car. Overlaid on the plot is a
+#|   smooth curve. The legend is at the bottom and classes are listed
+#|   horizontally in two rows. The points in the legend are larger than the
+#|   points in the plot.
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class)) +
+  geom_smooth(se = FALSE) +
+  theme(legend.position = "bottom") +
+  guides(color = guide_legend(nrow = 2, override.aes = list(size = 4)))
+```
+
+Note that the name of the argument in `guides()` matches the name of the aesthetic, just like in `labs()`.
+
+### Replacing a scale
+
+Instead of just tweaking the details a little, you can instead replace the scale altogether.
+There are two types of scales you're most likely to want to switch out: continuous position scales and color scales.
+Fortunately, the same principles apply to all the other aesthetics, so once you've mastered position and color, you'll be able to quickly pick up other scale replacements.
+
+It's very useful to plot transformations of your variable.
+For example, it's easier to see the precise relationship between `carat` and `price` if we log transform them:
+
+```{r}
+#| fig-align: default
+#| layout-ncol: 2
+#| fig-width: 3
+#| fig-alt: |
+#|   Two plots of price versus carat of diamonds. Data are binned and the
+#|   rectangles representing each bin are colored based on the number of points
+#|   that fall into that bin. In the plot on the right, price and carat values
+#|   are logged and the axis labels show the logged values.
+
+# Left
+ggplot(diamonds, aes(x = carat, y = price)) +
+  geom_bin2d()
+
+# Right
+ggplot(diamonds, aes(x = log10(carat), y = log10(price))) +
+  geom_bin2d()
+```
+
+However, the disadvantage of this transformation is that the axes are now labelled with the transformed values, making it hard to interpret the plot.
+Instead of doing the transformation in the aesthetic mapping, we can instead do it with the scale.
+This is visually identical, except the axes are labelled on the original data scale.
+
+```{r}
+#| fig-alt: |
+#|   Plot of price versus carat of diamonds. Data are binned and the
+#|   rectangles representing each bin are colored based on the number of points
+#|   that fall into that bin. The axis labels are on the original data scale.
+
+ggplot(diamonds, aes(x = carat, y = price)) +
+  geom_bin2d() +
+  scale_x_log10() +
+  scale_y_log10()
+```
+
+Another scale that is frequently customized is color.
+The default categorical scale picks colors that are evenly spaced around the color wheel.
+Useful alternatives are the ColorBrewer scales which have been hand tuned to work better for people with common types of color blindness.
+The two plots below look similar, but there is enough difference in the shades of red and green that the dots on the right can be distinguished even by people with red-green color blindness.[^communication-1]
+
+[^communication-1]: You can use a tool like [Sim Daltonism](https://michelf.ca/projects/sim-daltonism/) to simulate color blindness to test these images.
+
+```{r}
+#| fig-align: default
+#| layout-ncol: 2
+#| fig-width: 3
+#| fig-alt: |
+#|   Two scatterplots of highway mileage versus engine size where points are
+#|   colored by drive type. The plot on the left uses the default
+#|   ggplot2 color palette and the plot on the right uses a different color
+#|   palette.
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv))
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv)) +
+  scale_color_brewer(palette = "Set1")
+```
+
+Don't forget simpler techniques for improving accessibility.
+If there are just a few colors, you can add a redundant shape mapping.
+This will also help ensure your plot is interpretable in black and white.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway mileage versus engine size where both the color
+#|   and shape of points are based on drive type. The color palette is not
+#|   the default ggplot2 palette.
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv, shape = drv)) +
+  scale_color_brewer(palette = "Set1")
+```
+
+The ColorBrewer scales are documented online and made available in R via the **RColorBrewer** package, by Erich Neuwirth.
+@fig-brewer shows the complete list of all palettes.
+The sequential (top) and diverging (bottom) palettes are particularly useful if your categorical values are ordered, or have a "middle".
+This often arises if you've used `cut()` to make a continuous variable into a categorical variable.
+
+```{r}
+#| label: fig-brewer
+#| echo: false
+#| fig-cap: All ColorBrewer scales.
+#| fig-asp: 2.5
+#| fig-alt: |
+#|   All ColorBrewer scales. One group goes from light to dark colors.
+#|   Another group is a set of non-ordinal colors. And the last group has
+#|   diverging scales (from dark to light to dark again). Within each set
+#|   there are a number of palettes.
+
+par(mar = c(0, 3, 0, 0))
+RColorBrewer::display.brewer.all()
+```
+
+When you have a predefined mapping between values and colors, use `scale_color_manual()`.
+For example, if we map presidential party to color, we want to use the standard mapping of red for Republicans and blue for Democrats.
+One approach for assigning these colors is using hex color codes:
+
+```{r}
+#| fig-alt: |
+#|   Line plot of id number of presidents versus the year they started their
+#|   presidency. Start year is marked with a point and a segment that starts
+#|   there and ends at the end of the presidency. Democratic presidents are
+#|   represented in blue and Republicans in red.
+
+presidential |>
+  mutate(id = 33 + row_number()) |>
+  ggplot(aes(x = start, y = id, color = party)) +
+  geom_point() +
+  geom_segment(aes(xend = end, yend = id)) +
+  scale_color_manual(values = c(Republican = "#E81B23", Democratic = "#00AEF3"))
+```
+
+For continuous color, you can use the built-in `scale_color_gradient()` or `scale_fill_gradient()`.
+If you have a diverging scale, you can use `scale_color_gradient2()`.
+That allows you to give, for example, positive and negative values different colors.
+That's sometimes also useful if you want to distinguish points above or below the mean.
+
+Another option is to use the viridis color scales.
+The designers, Nathaniel Smith and Stéfan van der Walt, carefully tailored continuous color schemes that are perceptible to people with various forms of color blindness as well as perceptually uniform in both color and black and white.
+These scales are available as continuous (`c`), discrete (`d`), and binned (`b`) palettes in ggplot2.
+
+```{r}
+#| fig-align: default
+#| layout-ncol: 2
+#| fig-width: 3
+#| fig-asp: 0.75
+#| fig-alt: |
+#|   Three hex plots where the color of the hexes show the number of observations
+#|   that fall into that hex bin. The first plot uses the default, continuous
+#|   ggplot2 scale. The second plot uses the viridis, continuous scale, and the
+#|   third plot uses the viridis, binned scale.
+
+df <- tibble(
+  x = rnorm(10000),
+  y = rnorm(10000)
+)
+
+ggplot(df, aes(x, y)) +
+  geom_hex() +
+  coord_fixed() +
+  labs(title = "Default, continuous", x = NULL, y = NULL)
+
+ggplot(df, aes(x, y)) +
+  geom_hex() +
+  coord_fixed() +
+  scale_fill_viridis_c() +
+  labs(title = "Viridis, continuous", x = NULL, y = NULL)
+
+ggplot(df, aes(x, y)) +
+  geom_hex() +
+  coord_fixed() +
+  scale_fill_viridis_b() +
+  labs(title = "Viridis, binned", x = NULL, y = NULL)
+```
+
+Note that all color scales come in two varieties: `scale_color_*()` and `scale_fill_*()` for the `color` and `fill` aesthetics respectively (the color scales are available in both UK and US spellings).
+
+### Zooming
+
+There are three ways to control the plot limits:
+
+1. Adjusting what data are plotted.
+2. Setting the limits in each scale.
+3. Setting `xlim` and `ylim` in `coord_cartesian()`.
+
+We'll demonstrate these options in a series of plots.
+The plot on the left shows the relationship between engine size and fuel efficiency, colored by type of drive train.
+The plot on the right shows the same variables, but subsets the data that are plotted.
+Subsetting the data has affected the x and y scales as well as the smooth curve.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| message: false
+#| fig-alt: |
+#|   On the left, a scatterplot of highway mileage vs. displacement. The
+#|   smooth curve overlaid shows a decreasing, and then increasing trend,
+#|   like a hockey stick. On the right, same variables
+#|   are plotted with displacement ranging only from 5 to 6 and highway
+#|   mileage ranging only from 10 to 25. The smooth curve overlaid shows a
+#|   trend that's slightly increasing first and then decreasing.
+
+# Left
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv)) +
+  geom_smooth()
+
+# Right
+mpg |>
+  filter(displ >= 5 & displ <= 6 & hwy >= 10 & hwy <= 25) |>
+  ggplot(aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv)) +
+  geom_smooth()
+```
+
+Let's compare these to the two plots below where the plot on the left sets the `limits` on individual scales and the plot on the right sets them in `coord_cartesian()`.
+We can see that reducing the limits is equivalent to subsetting the data.
+Therefore, to zoom in on a region of the plot, it's generally best to use `coord_cartesian()`.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| message: false
+#| warning: false
+#| fig-alt: |
+#|   On the left, scatterplot of highway mileage vs. displacement, with
+#|   displacement ranging from 5 to 6 and highway mileage ranging from
+#|   10 to 25. The smooth curve overlaid shows a trend that's slightly
+#|   increasing first and then decreasing. On the right, same variables
+#|   are plotted with the same limits, however the smooth curve overlaid
+#|   shows a relatively flat trend with a slight increase at the end.
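+
+# Note: setting scale limits drops observations outside the range before
+# geom_smooth() computes its fit, while coord_cartesian() only zooms the
+# view and keeps all of the data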
+
+# Left
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv)) +
+  geom_smooth() +
+  scale_x_continuous(limits = c(5, 6)) +
+  scale_y_continuous(limits = c(10, 25))
+
+# Right
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = drv)) +
+  geom_smooth() +
+  coord_cartesian(xlim = c(5, 6), ylim = c(10, 25))
+```
+
+On the other hand, setting the `limits` on individual scales is generally more useful if you want to *expand* the limits, e.g., to match scales across different plots.
+For example, if we extract two classes of cars and plot them separately, it's difficult to compare the plots because all three scales (the x-axis, the y-axis, and the color aesthetic) have different ranges.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| fig-alt: |
+#|   On the left, a scatterplot of highway mileage vs. displacement of SUVs.
+#|   On the right, a scatterplot of the same variables for compact cars.
+#|   Points are colored by drive type for both plots. Among SUVs more of
+#|   the cars are 4-wheel drive and the others are rear-wheel drive, while
+#|   among compact cars more of the cars are front-wheel drive and the others
+#|   are 4-wheel drive. The SUV plot shows a clear negative relationship
+#|   between highway mileage and displacement while in the compact cars plot
+#|   the relationship is much flatter.
+
+suv <- mpg |> filter(class == "suv")
+compact <- mpg |> filter(class == "compact")
+
+# Left
+ggplot(suv, aes(x = displ, y = hwy, color = drv)) +
+  geom_point()
+
+# Right
+ggplot(compact, aes(x = displ, y = hwy, color = drv)) +
+  geom_point()
+```
+
+One way to overcome this problem is to share scales across multiple plots, training the scales with the `limits` of the full data.
+
+```{r}
+#| layout-ncol: 2
+#| fig-width: 4
+#| fig-alt: |
+#|   On the left, a scatterplot of highway mileage vs. displacement of SUVs.
+#|   On the right, a scatterplot of the same variables for compact cars.
+#|   Points are colored by drive type for both plots. Both plots are plotted
+#|   on the same scale for highway mileage, displacement, and drive type,
+#|   resulting in the legend showing all three types (front, rear, and 4-wheel
+#|   drive) for both plots even though there are no front-wheel drive SUVs and
+#|   no rear-wheel drive compact cars. Since the x and y scales are the same,
+#|   and go well beyond minimum or maximum highway mileage and displacement,
+#|   the points do not take up the entire plotting area.
+
+x_scale <- scale_x_continuous(limits = range(mpg$displ))
+y_scale <- scale_y_continuous(limits = range(mpg$hwy))
+col_scale <- scale_color_discrete(limits = unique(mpg$drv))
+
+# Left
+ggplot(suv, aes(x = displ, y = hwy, color = drv)) +
+  geom_point() +
+  x_scale +
+  y_scale +
+  col_scale
+
+# Right
+ggplot(compact, aes(x = displ, y = hwy, color = drv)) +
+  geom_point() +
+  x_scale +
+  y_scale +
+  col_scale
+```
+
+In this particular case, you could have simply used faceting, but this technique is useful more generally if, for instance, you want to spread plots over multiple pages of a report.
+
+### Exercises
+
+1. Why doesn't the following code override the default scale?
+
+    ```{r}
+    #| fig-show: "hide"
+
+    df <- tibble(
+      x = rnorm(10000),
+      y = rnorm(10000)
+    )
+
+    ggplot(df, aes(x, y)) +
+      geom_hex() +
+      scale_color_gradient(low = "white", high = "red") +
+      coord_fixed()
+    ```
+
+2. What is the first argument to every scale?
+   How does it compare to `labs()`?
+
+3. Change the display of the presidential terms by:
+
+    a. Combining the two variants that customize colors and x axis breaks.
+    b. Improving the display of the y axis.
+    c. Labelling each term with the name of the president.
+    d. Adding informative plot labels.
+    e. Placing breaks every 4 years (this is trickier than it seems!).
+
+4. First, create the following plot.
+   Then, modify the code using `override.aes` to make the legend easier to see.
+
+    ```{r}
+    #| fig-show: hide
+
+    ggplot(diamonds, aes(x = carat, y = price)) +
+      geom_point(aes(color = cut), alpha = 1/20)
+    ```
+
+## Themes {#sec-themes}
+
+Finally, you can customize the non-data elements of your plot with a theme:
+
+```{r}
+#| message: false
+#| fig-alt: |
+#|   Scatterplot of highway mileage vs. displacement of cars, colored by class
+#|   of car. The plot background is white, with gray grid lines.
+
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point(aes(color = class)) +
+  geom_smooth(se = FALSE) +
+  theme_bw()
+```
+
+ggplot2 includes the eight themes shown in @fig-themes, with `theme_gray()` as the default.[^communication-2]
+Many more are included in add-on packages like **ggthemes**, by Jeffrey Arnold.
+You can also create your own themes, if you are trying to match a particular corporate or journal style.
+
+[^communication-2]: Many people wonder why the default theme has a gray background.
+    This was a deliberate choice because it puts the data forward while still making the grid lines visible.
+    The white grid lines are visible (which is important because they significantly aid position judgments), but they have little visual impact and we can easily tune them out.
+    The gray background gives the plot a similar typographic color to the text, ensuring that the graphics fit in with the flow of a document without jumping out with a bright white background.
+    Finally, the gray background creates a continuous field of color which ensures that the plot is perceived as a single visual entity.
+
+```{r}
+#| label: fig-themes
+#| echo: false
+#| fig-cap: The eight themes built into ggplot2.
+#| fig-alt: |
+#|   Eight barplots created with ggplot2, each
+#|   with one of the eight built-in themes:
+#|   theme_bw() - White background with grid lines,
+#|   theme_light() - Light axes and grid lines,
+#|   theme_classic() - Classic theme, axes but no grid
+#|   lines, theme_linedraw() - Only black lines,
+#|   theme_dark() - Dark background for contrast,
+#|   theme_minimal() - Minimal theme, no background,
+#|   theme_gray() - Gray background (default theme),
+#|   theme_void() - Empty theme, only geoms are visible.
+
+knitr::include_graphics("images/visualization-themes.png")
+```
+
+It's also possible to control individual components of each theme, like the size and color of the font used for the y axis.
+We've already seen that `legend.position` controls where the legend is drawn.
+There are many other aspects of the legend that can be customized with `theme()`.
+For example, in the plot below we change the direction of the legend as well as put a black border around it.
+Note that customization of the legend box and plot title elements of the theme is done with `element_*()` functions.
+These functions specify the styling of non-data components, e.g., the title text is bolded in the `face` argument of `element_text()` and the legend border color is defined in the `color` argument of `element_rect()`.
+The theme elements that control the position of the title and the caption are `plot.title.position` and `plot.caption.position`, respectively.
+In the following plot, these are set to `"plot"` to indicate these elements are aligned to the entire plot area, instead of the plot panel (the default).
+A few other helpful `theme()` components are used to change the placement and format of the title and caption text.
+
+```{r}
+#| fig-alt: |
+#|   Scatterplot of highway fuel efficiency versus engine size of cars, colored
+#|   by drive. The plot is titled 'Larger engine sizes tend to have lower fuel
+#|   economy' with the caption pointing to the source of the data, fueleconomy.gov.
+#|   The caption and title are left justified, the legend is inside of the plot
+#|   with a black border.
+
+ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
+  geom_point() +
+  labs(
+    title = "Larger engine sizes tend to have lower fuel economy",
+    caption = "Source: https://fueleconomy.gov."
+  ) +
+  theme(
+    legend.position = c(0.6, 0.7),
+    legend.direction = "horizontal",
+    legend.box.background = element_rect(color = "black"),
+    plot.title = element_text(face = "bold"),
+    plot.title.position = "plot",
+    plot.caption.position = "plot",
+    plot.caption = element_text(hjust = 0)
+  )
+```
+
+For an overview of all `theme()` components, see help with `?theme`.
+The [ggplot2 book](https://ggplot2-book.org/) is also a great place to go for the full details on theming.
+
+### Exercises
+
+1. Pick a theme offered by the ggthemes package and apply it to the last plot you made.
+2. Make the axis labels of your plot blue and bolded.
+
+## Layout
+
+So far we've talked about how to create and modify a single plot.
+What if you have multiple plots you want to lay out in a certain way?
+The patchwork package allows you to combine separate plots into the same graphic.
+We loaded this package earlier in the chapter.
+
+To place two plots next to each other, you can simply add them to each other.
+Note that you first need to create the plots and save them as objects (in the following example they're called `p1` and `p2`).
+Then, you place them next to each other with `+`.
+
+```{r}
+#| fig-width: 6
+#| fig-asp: 0.5
+#| fig-alt: |
+#|   Two plots (a scatterplot of highway mileage versus engine size and a
+#|   side-by-side boxplots of highway mileage versus drive train) placed next
+#|   to each other.
+
+p1 <- ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_point() +
+  labs(title = "Plot 1")
+p2 <- ggplot(mpg, aes(x = drv, y = hwy)) +
+  geom_boxplot() +
+  labs(title = "Plot 2")
+p1 + p2
+```
+
+It's important to note that in the above code chunk we did not use a new function from the patchwork package.
+Instead, the package added new functionality to the `+` operator.
+
+You can also create complex plot layouts with patchwork.
+In the following, `|` places `p1` and `p3` next to each other and `/` moves `p2` to the next line.
+
+```{r}
+#| fig-width: 6
+#| fig-asp: 0.8
+#| fig-alt: |
+#|   Three plots laid out such that the first and third plot are next to each
+#|   other and the second plot is stretched beneath them. The first plot is a
+#|   scatterplot of highway mileage versus engine size, the third plot is a
+#|   scatterplot of highway mileage versus city mileage, and the second plot
+#|   is side-by-side boxplots of highway mileage versus drive train.
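+
+# In patchwork, `|` lays plots out side by side and `/` stacks them, so
+# (p1 | p3) / p2 puts p1 and p3 on one row with p2 spanning the row below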
+
+p3 <- ggplot(mpg, aes(x = cty, y = hwy)) +
+  geom_point() +
+  labs(title = "Plot 3")
+(p1 | p3) / p2
+```
+
+Additionally, patchwork allows you to collect legends from multiple plots into one common legend, customize the placement of the legend as well as the dimensions of the plots, and add a common title, subtitle, caption, etc. to your plots.
+Below we create 5 plots.
+We have turned off the legends on the box plots and the scatterplot and collected the legends for the density plots at the top of the plot with `& theme(legend.position = "top")`.
+Note the use of the `&` operator here instead of the usual `+`.
+This is because we're modifying the theme for the patchwork plot as opposed to the individual ggplots.
+The legend is placed on top, inside the `guide_area()`.
+Finally, we have also customized the heights of the various components of our patchwork -- the guide has a height of 1, the box plots 3, density plots 2, and the faceted scatterplot 4.
+Patchwork divides up the area you have allotted for your plot using this scale and places the components accordingly.
+
+```{r}
+#| fig-width: 8
+#| fig-asp: 1
+#| fig-alt: |
+#|   Five plots laid out such that first two plots are next to each other. Plots
+#|   three and four are underneath them. And the fifth plot stretches under them.
+#|   The patchworked plot is titled "City and highway mileage for cars with
+#|   different drive trains" and captioned "Source: https://fueleconomy.gov".
+#|   The first two plots are side-by-side box plots. Plots 3 and 4 are density
+#|   plots. And the fifth plot is a faceted scatterplot. Each of these plots show
+#|   geoms colored by drive train, but the patchworked plot has only one legend
+#|   that applies to all of them, above the plots and beneath the title.
+
+p1 <- ggplot(mpg, aes(x = drv, y = cty, color = drv)) +
+  geom_boxplot(show.legend = FALSE) +
+  labs(title = "Plot 1")
+
+p2 <- ggplot(mpg, aes(x = drv, y = hwy, color = drv)) +
+  geom_boxplot(show.legend = FALSE) +
+  labs(title = "Plot 2")
+
+p3 <- ggplot(mpg, aes(x = cty, color = drv, fill = drv)) +
+  geom_density(alpha = 0.5) +
+  labs(title = "Plot 3")
+
+p4 <- ggplot(mpg, aes(x = hwy, color = drv, fill = drv)) +
+  geom_density(alpha = 0.5) +
+  labs(title = "Plot 4")
+
+p5 <- ggplot(mpg, aes(x = cty, y = hwy, color = drv)) +
+  geom_point(show.legend = FALSE) +
+  facet_wrap(~drv) +
+  labs(title = "Plot 5")
+
+(guide_area() / (p1 + p2) / (p3 + p4) / p5) +
+  plot_annotation(
+    title = "City and highway mileage for cars with different drive trains",
+    caption = "Source: https://fueleconomy.gov."
+  ) +
+  plot_layout(
+    guides = "collect",
+    heights = c(1, 3, 2, 4)
+  ) &
+  theme(legend.position = "top")
+```
+
+If you'd like to learn more about combining and laying out multiple plots with patchwork, we recommend looking through the guides on the package website: <https://patchwork.data-imaginist.com>.
+
+### Exercises
+
+1. What happens if you omit the parentheses in the following plot layout?
+   Can you explain why this happens?
+
+    ```{r}
+    #| fig-show: hide
+
+    p1 <- ggplot(mpg, aes(x = displ, y = hwy)) +
+      geom_point() +
+      labs(title = "Plot 1")
+    p2 <- ggplot(mpg, aes(x = drv, y = hwy)) +
+      geom_boxplot() +
+      labs(title = "Plot 2")
+    p3 <- ggplot(mpg, aes(x = cty, y = hwy)) +
+      geom_point() +
+      labs(title = "Plot 3")
+
+    (p1 | p2) / p3
+    ```
+
+2. Using the three plots from the previous exercise, recreate the following patchwork.
+
+    ```{r}
+    #| fig-width: 7
+    #| fig-asp: 0.8
+    #| echo: false
+    #| fig-alt: |
+    #|   Three plots: Plot 1 is a scatterplot of highway mileage versus engine size.
+    #|   Plot 2 is side-by-side box plots of highway mileage versus drive train.
+    #|   Plot 3 is side-by-side box plots of city mileage versus drive train.
+    #|   Plot 1 is on the first row. Plots 2 and 3 are on the next row, each
+    #|   spanning half the width of Plot 1. Plot 1 is labelled "Fig. A", Plot 2
+    #|   is labelled "Fig. B", and Plot 3 is labelled "Fig. C".
+
+    p1 / (p2 + p3) +
+      plot_annotation(
+        tag_levels = c("A"),
+        tag_prefix = "Fig. ",
+        tag_suffix = ":"
+      )
+    ```
+
+## Summary
+
+In this chapter, you've learned about adding plot labels such as the title, subtitle, and caption, as well as modifying default axis labels, using annotation to add informational text to your plot or to highlight specific data points, customizing the axis scales, and changing the theme of your plot.
+You've also learned about combining multiple plots in a single graph using both simple and complex plot layouts.
+
+While you've so far learned about how to make many different types of plots and how to customize them using a variety of techniques, we've barely scratched the surface of what you can create with ggplot2.
+If you want to get a comprehensive understanding of ggplot2, we recommend reading the book, [*ggplot2: Elegant Graphics for Data Analysis*](https://ggplot2-book.org).
+Other useful resources are the [*R Graphics Cookbook*](https://r-graphics.org) by Winston Chang and [*Fundamentals of Data Visualization*](https://clauswilke.com/dataviz/) by Claus Wilke.
diff --git a/contribs.txt b/contribs.txt
deleted file mode 100644
index b78da20f7..000000000
--- a/contribs.txt
+++ /dev/null
@@ -1,94 +0,0 @@
- 625 hadley
- 93 Garrett
- 77 Hadley Wickham
- 50 S'busiso Mkhondwane
- 21 behrman
- 11 Brett Klamer
- 10 Radu Grosu
- 9 Brandon Greenwell
- 8 Bill Behrman
- 7 Garrett Grolemund
- 7 Rademeyer Vermaak
- 7 Colin Gillespie
- 7 harrismcgehee
- 6 jjchern
- 6 Jakub Nowosad
- 6 OaCantona
- 5 kdpsingh
- 5 Julian During
- 4 Thomas Klebel
- 4 Mine Cetinkaya-Rundel
- 4 Jennifer (Jenny) Bryan
- 4 Terence Teo
- 4 Patrick Kennedy
- 3 Jonathan Page
- 3 Jose Roberto Ayala Solares
- 3 yahwes
- 3 seamus-mckinsey
- 3 Ian Lyttle
- 3 Ian Sealy
- 3 Yihui Xie
- 2 Cooper Morris
- 2 Christian G. Warden
- 2 Daniel Gromer
- 2 Devin Pastoor
- 2 Etienne B. Racine
- 2 Jim Hester
- 2 Joanne Jang
- 2 Kirill Sevastyanenko
- 2 MJMarshall
- 2 Nirmal Patel
- 2 Paul
- 2 Robert Schuessler
- 2 Will Beasley
- 2 rlzijdeman
- 2 robinlovelace
- 2 sibusiso16
- 2 spirgel
- 1 robinsones
- 1 Jeroen Janssens
- 1 Mustafa Ascha
- 1 Nelson Areal
- 1 Nick Clark
- 1 Alex
- 1 Hengni Cai
- 1 Gregory Jefferis
- 1 seanpwilliams
- 1 Peter Hurford
- 1 Flemming Villalona
- 1 Eric Watt
- 1 shoili
- 1 Earl Brown
- 1 Shannon Ellis
- 1 Steve Mortimer
- 1 TJ Mahr
- 1 Dylan Cashman
- 1 Derwin McGeary
- 1 Tom Prior
- 1 Ajay Deonarine
- 1 David Clark
- 1 adi pradhan
- 1 bahadir cankardes
- 1 batpigandme
- 1 Curtis Alexander
- 1 Ahmed ElGabbas
- 1 Christian Mongeau
- 1 jennybc
- 1 Ben Marwick
- 1 jonathanflint
- 1 Andrew Landgraf
- 1 koalabearski
- 1 nate-d-olson
- 1 nickelas
- 1 nwaff
- 1 zeal626
- 1 Jon Calder
- 1 Julia Stewart Lowndes
- 1 John Sears
- 1 Justinas Petuchovas
- 1 Kara Woo
- 1 Kenny Darrell
- 1 svenski
- 1 KyleHumphrey
- 1 Lawrence Wu
- 1 Matthew Sedaghatfar
diff --git a/contribute.qmd b/contribute.qmd
new file mode 100644
index 000000000..7838cc014
--- /dev/null
+++ b/contribute.qmd
@@ -0,0 +1,19 @@
+# Contributing {#sec-contributing}
+
+This book has been developed in the open, and it wouldn't be nearly as good without your contributions.
+There are a number of ways you can help make the book even better:
+
+- If you don't understand something, please [let me know](mailto:h.wickham@gmail.com).
+  Your feedback on what is confusing or hard to understand is valuable.
+
+- If you spot a typo, feel free to edit the underlying page and send a pull request.
+  If you've never done this before, the process is very easy:
+
+  - Click the "Edit this page" link on the sidebar.
+
+  - Make the changes using GitHub's in-page editor and save.
+
+  - Submit a pull request and include a brief description of your changes.
+    "Fixing typos" is perfectly adequate.
+
+  - If you make significant changes, include the phrase "I assign the copyright of this contribution to Hadley Wickham" - I need this so I can publish the printed book.
diff --git a/contribute.rmd b/contribute.rmd
deleted file mode 100644
index 8920eec5b..000000000
--- a/contribute.rmd
+++ /dev/null
@@ -1,23 +0,0 @@
-# Contributing
-
-This book has been developed in the open, and it wouldn't be nearly as good
-without your contributions. There are a number of ways you can help make the
-book even better:
-
-* If you don't understand something, please
-  [let me know](mailto:h.wickham@gmail.com). Your feedback on what is confusing
-  or hard to understand is valuable.
-
-* If you spot a typo, feel free to edit the underlying page and send a pull
-  request. If you've never done this before, the process is very easy:
-
-  * Click the edit this page on the sidebar.
-
-  * Make the changes using github's in-page editor and save.
-
-  * Submit a pull request and include a brief description of your changes.
-    "Fixing typos" is perfectly adequate.
-
-  * If you make significant changes, include the phrase "I assign the
-    copyright of this contribution to Hadley Wickham" - I need this so I can
-    publish the printed book.
diff --git a/contributors.R b/contributors.R
new file mode 100644
index 000000000..db69421c6
--- /dev/null
+++ b/contributors.R
@@ -0,0 +1,31 @@
+library(tidyverse)
+contribs_all_json <- gh::gh("/repos/:owner/:repo/contributors",
+  owner = "hadley",
+  repo = "r4ds",
+  .limit = Inf
+)
+contribs_all <- tibble(
+  login = contribs_all_json %>% map_chr("login"),
+  n = contribs_all_json %>% map_int("contributions")
+)
+
+contribs_old <- read_csv("contributors.csv", col_types = list())
+contribs_new <- contribs_all %>% anti_join(contribs_old)
+
+# Get info for new contributors
+needed_json <- map(
+  contribs_new$login,
+  ~ gh::gh("/users/:username", username = .x)
+)
+info_new <- tibble(
+  login = map_chr(needed_json, "login", .default = NA),
+  name = map_chr(needed_json, "name", .default = NA),
+  blog = map_chr(needed_json, "blog", .default = NA)
+)
+info_old <- contribs_old %>% select(login, name, blog)
+info_all <- bind_rows(info_old, info_new)
+
+contribs_all <- contribs_all %>%
+  left_join(info_all) %>%
+  arrange(login)
+write_csv(contribs_all, "contributors.csv")
diff --git a/contributors.csv b/contributors.csv
new file mode 100644
index 000000000..bd5eee945
--- /dev/null
+++ b/contributors.csv
@@ -0,0 +1,263 @@
+login,n,name,blog
+a-rosenberg,1,NA,NA
+a2800276,1,Tim Becker,NA
+Abinashbunty,1,Abinash Satapathy,https://www.abinash.nl/
+adam-gruer,1,Adam Gruer,adamgruer.rbind.io
+adidoit,1,adi pradhan,http://adidoit.github.io
+Adrianzo,1,A.
s.,NA +aephidayatuloh,1,Aep Hidyatuloh,NA +agila5,1,Andrea Gilardi,NA +ajay-d,1,Ajay Deonarine,http://deonarine.com/ +AlanFeder,1,NA,NA +alansuidaihe,1,Daihe Sui, +alberto-agudo,9,NA, +AlbertRapp,1,NA,NA +aleloi,1,NA,NA +alonzi,1,pete,NA +ALShum,1,Alex,www.ALShum.com +amacfarland,1,Andrew M.,NA +andland,1,Andrew Landgraf,http://andrewlandgraf.com +andyhuynh92,1,NA, +angela-li,1,Angela Li,NA +AnttiRask,1,Antti Rask,youcanbeapirate.com +aquarhead,1,LOU Xun,https://aqd.is +ariespirgel,2,NA,https://arie.rbind.io +august-18,1,NA,NA +aviast,1,Michael Henry,NA +azzaea,1,Azza Ahmed,https://azzaea.netlify.com/ +bambooforest,1,Steven Moran,https://scholar.google.com/citations?user=PpTOh08AAAAJ&hl=en +BarkleyBG,1,Brian G. Barkley,BarkleyBG.netlify.com +batpigandme,5,Mara Averick,https://twitter.com/dataandme +BB1464,1,Oluwafemi OYEDELE,statisticalinference.netlify.app +bbrewington,1,Brent Brewington,NA +behrman,29,Bill Behrman,NA +benherbertson,3,Ben Herbertson,NA +benmarwick,2,Ben Marwick,http://faculty.washington.edu/bmarwick/ +bensteinberg,4,Ben Steinberg,NA +bentyeh,1,Benjamin Yeh,https://bentyeh.github.io +betulturkoglu,1,Betul Turkoglu,NA +bgreenwell,9,Brandon Greenwell,NA +BinxiePeterson,1,Bianca Peterson,NA +BirgerNi,1,Birger Niklas,NA +bklamer,11,Brett Klamer,NA +boardtc,1,NA,NA +c-hoh,1,Christian,hohenfeld.is +caddycarine,1,Caddy,NA +camillevleonard,1,Camille V Leonard,https://www.camillevleonard.com/ +canovasjm,1,NA,NA +cedricbatailler,1,Cedric Batailler,cedricbatailler.me +christina-wei,1,Christina Wei, +chrMongeau,1,Christian Mongeau,http://mongeau.net +coopermor,2,Cooper Morris,NA +csgillespie,7,Colin Gillespie,http://www.mas.ncl.ac.uk/~ncsg3/ +csrvermaak,7,Rademeyer Vermaak,NA +cthierst,1,Chloe Thierstein, +ctsa,1,Chris Saunders,http://www.linkedin.com/in/christophertsaunders +curious-abhinav,1,Abhinav Singh,https://curious-abhinav.github.io +curtisalexander,1,Curtis Alexander,https://www.calex.org +cwarden,2,Christian G. Warden,http://xn.pinkhamster.net/ +cwickham,1,Charlotte Wickham,http://cwick.co.nz +darrkj,1,Kenny Darrell,http://darrkj.github.io/blogs +davidkane9,1,David Kane,www.davidkane.info +davidrsch,6,David,NA +davidrubinger,1,David Rubinger,NA +DDClark,1,David Clark,NA +derwinmcgeary,1,Derwin McGeary,http://derwinmcgeary.github.io +dgromer,2,Daniel Gromer,NA +Divider85,3,NA,NA +djbirke,1,NA,NA +djnavarro,1,Danielle Navarro,https://djnavarro.net +DOH-RPS1303,1,Russell Shean,NA +dongzhuoer,5,Zhuoer Dong,https://dongzhuoer.github.io +dpastoor,2,Devin Pastoor,NA +DSGeoff,1,NA,NA +dthakkar09,1,Devarshi Thakkar, +duju211,13,Julian During,NA +dylancashman,1,Dylan Cashman,https://www.eecs.tufts.edu/~dcashm01/ +eddelbuettel,1,Dirk Eddelbuettel,http://dirk.eddelbuettel.com +EdwinTh,4,Edwin Thoen,thats-so-random.com +elgabbas,1,Ahmed El-Gabbas,https://elgabbas.github.io +enryH,1,Henry Webel,NA +ercan7,1,Ercan Karadas,NA +EricKit,1,Eric Kitaif,NA +ericwatt,1,Eric Watt,www.ericdwatt.com +erikerhardt,2,Erik Erhardt,StatAcumen.com +etiennebr,2,Etienne B. 
Racine,NA +evjrob,1,Everett Robinson,NA +fellennert,1,NA,NA +flemmingmiguel,1,Flemming Miguel,NA +florisvdh,1,Floris Vanderhaeghe,NA +funkybluehen,1,NA,NA +gabrivera,1,NA,NA +gadenbuie,1,Garrick Aden-Buie,https://garrickadenbuie.com +ganong123,1,Peter Ganong,voices.uchicago.edu/ganong +garrettgman,103,Garrett Grolemund,NA +GeroVanMi,1,Gerome Meyer,https://astralibra.ch +gl-eb,1,Gleb Ebert,glebsite.ch +GoldbergData,1,Josh Goldberg,https://twitter.com/GoldbergData +gridgrad,1,bahadir cankardes,NA +gustavdelius,2,Gustav W Delius,NA +hadley,1173,Hadley Wickham,http://hadley.nz +hao-trivago,2,Hao Chen,NA +harrismcgehee,7,Harris McGehee,https://gist.github.com/harrismcgehee +hendrikweisser,1,NA,NA +hengnicai,1,Hengni Cai,NA +Iain-S,1,Iain,NA +iansealy,3,Ian Sealy,NA +ijlyttle,3,Ian Lyttle,NA +ivan-krukov,1,Ivan Krukov,NA +jacobkap,1,Jacob Kaplan,http://crimedatatool.com/ +jazzlw,1,Jazz Weisman,NA +jdblischak,1,John Blischak,https://jdblischak.com/ +jdstorey,1,John D. Storey,http://jdstorey.github.io/ +jefferis,1,Gregory Jefferis,http://www2.mrc-lmb.cam.ac.uk/group-leaders/h-to-m/gregory-jefferis/ +JeffreyRStevens,2,Jeffrey Stevens,https://decisionslab.unl.edu/ +JeldorPKU,1,蒋雨蒙,https://jeldorpku.github.io +jennybc,5,Jennifer (Jenny) Bryan,https://jennybryan.org +jenren,1,Jen Ren,NA +jeroenjanssens,1,Jeroen Janssens,http://jeroenjanssens.com +jeromecholewa,1,NA,NA +jilmun,3,Janet Wesner,jilmun.github.io +jimhester,2,Jim Hester,http://www.jimhester.com +jjchern,6,JJ Chen,NA +jkolacz,1,Jacek Kolacz,NA +joannejang,2,Joanne Jang,joannejang.com +johannes4998,1,NA,NA +johnsears,1,John Sears,NA +jonathanflint,1,NA,NA +jonmcalder,1,Jon Calder,http://joncalder.co.za +jonpage,3,Jonathan Page,economistry.com +jonthegeek,1,Jon Harmon,http://jonthegeek.com +jooyoungseo,2,JooYoung Seo,https://jooyoungseo.github.io +jpetuchovas,1,Justinas Petuchovas,NA +jrdnbradford,1,Jordan,www.linkedin.com/in/jrdnbradford +jrnold,4,Jeffrey Arnold,http://jrnold.me +jroberayalas,7,Jose Roberto Ayala Solares,jroberayalas.netlify.com +jtr13,1,Joyce Robbins,NA +juandering,1,NA,NA +jules32,1,Julia Stewart Lowndes,http://jules32.github.io +kaetschap,1,Sonja,NA +karawoo,1,Kara Woo,http://karawoo.com +katrinleinweber,1,Katrin Leinweber,NA +kdpsingh,5,Karandeep Singh,http://umich.edu/~kdpsingh +kevinxperese,5,Kevin Perese,NA +kferris10,1,Kevin Ferris,NA +kirillseva,2,Kirill Sevastyanenko,NA +KittJonathan,15,Jonathan Kitt,NA +koalabearski,1,NA,NA +krlmlr,1,Kirill Müller,NA +kucharsky,1,Rafał Kucharski,NA +kwstat,1,Kevin Wright,NA +landesbergn,1,Noah Landesberg,noahlandesberg.com +lawwu,1,Lawrence Wu,NA +lindbrook,1,NA,NA +lwjohnst86,2,Luke W Johnston,lukewjohnston.com +MarckK,1,Kara de la Marck,https://www.linkedin.com/in/karadelamarck +marwahaha,1,Kunal Marwaha,kunalmarwaha.com/about +matanhakim,1,Matan Hakim,NA +MatthiasLiew,3,Matthias Liew,NA +MattWittbrodt,1,Matt Wittbrodt,mattwittbrodt.com +maurolepore,2,Mauro Lepore,https://fgeo.netlify.com/ +mbeveridge,7,Mark Beveridge,https://twitter.com/mbeveridge +mcewenkhundi,1,NA,NA +mcsnowface,6,"mcsnowface, PhD",NA +mfherman,1,Matt Herman,mattherman.info +michaelboerman,1,Michael Boerman,https://michaelboerman.com +mine-cetinkaya-rundel,158,Mine Cetinkaya-Rundel,https://stat.duke.edu/~mc301 +mitsuoxv,31,Mitsuo Shiota,https://mitsuoxv.rbind.io/ +mjhendrickson,1,Matthew Hendrickson,https://about.me/matthew.j.hendrickson +MJMarshall,2,NA,NA +mkfin7,1,Misty Knight-Finley, +mmhamdy,1,Mohammed Hamdy,NA +mnazarov,1,Maxim Nazarov,NA +mpaulacaldas,4,Maria Paula Caldas,mpaulacaldas.com 
+mustafaascha,1,Mustafa Ascha,NA +nareal,1,Nelson Areal,nelsonareal.net +nate-d-olson,1,Nate Olson,NA +nateaff,1,Nathanael,nateaff.com +nattalides,1,NA,NA +NedJWestern,1,Ned Western,NA +nickclark1000,1,Nick Clark,NA +nickelas,1,NA,NA +nirmalpatel,2,Nirmal Patel,http://playpowerlabs.com +nischalshrestha,1,Nischal Shrestha,http://nischalshrestha.me +njtierney,1,Nicholas Tierney,http://www.njtierney.com +Nowosad,6,Jakub Nowosad,https://nowosad.github.io +nstjhp,1,Nick Pullen, +olivier6088,1,NA,NA +oliviercailloux,1,Olivier Cailloux,https://www.lamsade.dauphine.fr/~ocailloux/ +p0bs,1,Robin Penfold,p0bs.com +pabloedug,1,Pablo E. Garcia,NA +padamson,1,Paul Adamson,padamson.github.io +penelopeysm,1,Penelope Y,NA +peterhurford,1,Peter Hurford,http://www.peterhurford.com +petzi53,14,Peter Baumgartner,https://notes.peter-baumgartner.net/ +pkq,4,Patrick Kennedy,NA +pooyataher,1,Pooya Taherkhani,https://gitlab.com/pooyat +PursuitOfDataScience,14,Y. Yu,https://youzhi.netlify.app/ +radugrosu,10,Radu Grosu,radugrosu.com +Ranae,2,Ranae Dietzel,ranae.github.io +rastrau,2,Ralph Straumann,https://ralphstraumann.ch +raynamharris,1,Rayna M Harris,https://www.raynamharris.com +ReeceGoding,1,NA,NA +rgertenbach,1,Robin Gertenbach,NA +RIngyao,1,Jajo,NA +rivaquiroga,1,Riva Quiroga,https://rivaquiroga.cl/ +RJHKnight,1,Richard Knight,NA +rlzijdeman,2,Richard Zijdeman,NA +robertchu03,1,NA,NA +RobinKohrs,1,Robin Kohrs,https://quarantino.netlify.app/ +Robinlovelace,2,Robin,http://robinlovelace.net +robinsones,1,Emily Robinson,robinsones.github.io +robtenorio,1,Rob Tenorio,NA +RodAli,1,Rod Mazloomi,NA +RohanAlexander,5,Rohan Alexander,https://www.rohanalexander.com/ +RomeroBarata,1,Romero Morais,NA +rudeboybert,1,Albert Y. Kim,http://rudeboybert.rbind.io/ +saghirb,3,Saghir,http://www.ilustat.com +salmasian,1,Hojjat Salmasian,NA +sauercrowd,1,Jonas,https://blog.sauercrowdlabs.xyz +sciencificity,3,Vebash Naidoo,https://sciencificity-blog.netlify.app/ +seamus-mckinsey,4,Seamus McKinsey,NA +seanpwilliams,1,NA,NA +seasmith,1,Luke Smith,https://seasmith.github.io +sedaghatfar,3,Matthew Sedaghatfar,NA +sekR4,1,Sebastian Kraus,https://www.linkedin.com/in/sebastiankrausjena +sfirke,1,Sam Firke,samfirke.com +ShanEllis,1,Shannon Ellis,shanellis.com +shoili,1,NA,shoili.github.io +Shurakai,2,Christian Heinrich,NA +sibusiso16,52,S'busiso Mkhondwane,NA +sm-raiyyan,1,SM Raiyyan, +sonicdoe,11,Jakob Krigovsky,https://sonicdoe.com +stephan-koenig,3,Stephan Koenig,stephankoenig.me +stephenbalogun,6,Stephen Balogun,https://stephenbalogun.github.io/stbalogun/ +StevenMMortimer,1,Steven M. 
Mortimer,https://stevenmortimer.com +stragu,4,Stéphane Guillou,https://stragu.github.io/ +sulgik,2,Sulgi Kim, +svenski,1,Sergiusz Bleja,NA +talgalili,1,Tal Galili,https://www.r-statistics.com +Taurenamo,1,Alec Fisher, +tgerarden,1,Todd Gerarden,http://toddgerarden.com +thomasggodfrey,1,Tom Godfrey, +timbroderick,1,Tim Broderick,http://www.timbroderick.net +timwaterhouse,1,Tim Waterhouse,NA +tjmahr,1,TJ Mahr,tjmahr.com +tklebel,4,Thomas Klebel,https://thomasklebel.eu +tomjamesprior,1,Tom Prior,NA +tteo,4,Terence Teo,tteo.github.io +twgardner2,1,NA,NA +ulyngs,4,Ulrik Lyngs,www.ulriklyngs.com +uribo,1,Shinya Uryu,https://uribo.hatenablog.com +vanderlindenma,1,Martin Van der Linden,NA +waltersom,1,Walter Somerville,NA +werkstattcodes,1,NA,http://werk.statt.codes +wibeasley,2,Will Beasley,http://scholar.google.com/citations?user=ffsJTC0AAAAJ&hl=en +yihui,4,Yihui Xie,https://yihui.name +yimingli,3,Yiming (Paul) Li,https://yimingli.net +yingxingwu,1,NA,NA +yutannihilation,1,Hiroaki Yutani,https://twitter.com/yutannihilation +yuyu-aung,1,Yu Yu Aung,NA +zachbogart,1,Zach Bogart,zachbogart.com +zeal626,1,NA,NA +zekiakyol,16,Zeki Akyol,zekiakyol.com diff --git a/cover.jpg b/cover.jpg new file mode 100644 index 000000000..fbe1bff6b Binary files /dev/null and b/cover.jpg differ diff --git a/cover.png b/cover.png deleted file mode 100644 index a7150bdfa..000000000 Binary files a/cover.png and /dev/null differ diff --git a/data-import.qmd b/data-import.qmd new file mode 100644 index 000000000..4f97b9222 --- /dev/null +++ b/data-import.qmd @@ -0,0 +1,539 @@ +# Data import {#sec-data-import} + +```{r} +#| echo: false + +source("_common.R") +``` + +## Introduction + +Working with data provided by R packages is a great way to learn data science tools, but you want to apply what you've learned to your own data at some point. +In this chapter, you'll learn the basics of reading data files into R. + +Specifically, this chapter will focus on reading plain-text rectangular files. +We'll start with practical advice for handling features like column names, types, and missing data. +You will then learn about reading data from multiple files at once and writing data from R to a file. +Finally, you'll learn how to handcraft data frames in R. + +### Prerequisites + +In this chapter, you'll learn how to load flat files in R with the **readr** package, which is part of the core tidyverse. + +```{r} +#| label: setup +#| message: false + +library(tidyverse) +``` + +## Reading data from a file + +To begin, we'll focus on the most common rectangular data file type: CSV, which is short for comma-separated values. +Here is what a simple CSV file looks like. +The first row, commonly called the header row, gives the column names, and the following six rows provide the data. +The columns are separated, aka delimited, by commas. + +```{r} +#| echo: false +#| message: false +#| comment: "" + +read_lines("data/students.csv") |> cat(sep = "\n") +``` + +@tbl-students-table shows a representation of the same data as a table. + +```{r} +#| label: tbl-students-table +#| echo: false +#| message: false +#| tbl-cap: Data from the students.csv file as a table. + +read_csv("data/students.csv") |> + knitr::kable() +``` + +We can read this file into R using `read_csv()`. +The first argument is the most important: the path to the file. +You can think about the path as the address of the file: the file is called `students.csv` and it lives in the `data` folder.
+ +```{r} +#| message: true + +students <- read_csv("data/students.csv") +``` + +The code above will work if you have the `students.csv` file in a `data` folder in your project. +You can download the `students.csv` file from <https://pos.it/r4ds-students-csv> or you can read it directly from that URL with: + +```{r} +#| eval: false + +students <- read_csv("https://pos.it/r4ds-students-csv") +``` + +When you run `read_csv()`, it prints out a message telling you the number of rows and columns of data, the delimiter that was used, and the column specifications (names of columns organized by the type of data the column contains). +It also prints out some information about retrieving the full column specification and how to quiet this message. +This message is an integral part of readr, and we'll return to it in @sec-col-types. + +### Practical advice + +Once you read data in, the first step usually involves transforming it in some way to make it easier to work with in the rest of your analysis. +Let's take another look at the `students` data with that in mind. + +```{r} +students +``` + +In the `favourite.food` column, there are a bunch of food items, and then the character string `N/A`, which should have been a real `NA` that R will recognize as "not available". +This is something we can address using the `na` argument. +By default, `read_csv()` only recognizes empty strings (`""`) in this dataset as `NA`s; we want it to also recognize the character string `"N/A"`. + +```{r} +#| message: false +students <- read_csv("data/students.csv", na = c("N/A", "")) + +students +``` + +You might also notice that the `Student ID` and `Full Name` columns are surrounded by backticks. +That's because they contain spaces, breaking R's usual rules for variable names; they're **non-syntactic** names. +To refer to these variables, you need to surround them with backticks, `` ` ``: + +```{r} +students |> + rename( + student_id = `Student ID`, + full_name = `Full Name` + ) +``` + +An alternative approach is to use `janitor::clean_names()`, which uses some heuristics to turn them all into snake case at once[^data-import-1]. + +[^data-import-1]: The [janitor](http://sfirke.github.io/janitor/) package is not part of the tidyverse, but it offers handy functions for data cleaning and works well within data pipelines that use `|>`. + +```{r} +#| message: false + +students |> janitor::clean_names() +``` + +Another common task after reading in data is to consider variable types. +For example, `meal_plan` is a categorical variable with a known set of possible values, which in R should be represented as a factor: + +```{r} +students |> + janitor::clean_names() |> + mutate(meal_plan = factor(meal_plan)) +``` + +Note that the values in the `meal_plan` variable have stayed the same, but the type of variable denoted underneath the variable name has changed from character (`<chr>`) to factor (`<fct>`). +You'll learn more about factors in @sec-factors. + +Before you analyze these data, you'll probably want to fix the `age` and `id` columns. +Currently, `age` is a character variable because one of the observations is typed out as `five` instead of a numeric `5`. +We discuss the details of fixing this issue in @sec-import-spreadsheets. + +```{r} +students <- students |> + janitor::clean_names() |> + mutate( + meal_plan = factor(meal_plan), + age = parse_number(if_else(age == "five", "5", age)) + ) + +students +``` + +A new function here is `if_else()`, which has three arguments. +The first argument `test` should be a logical vector.
+The result will contain the value of the second argument, `yes`, when `test` is `TRUE`, and the value of the third argument, `no`, when it is `FALSE`. +Here we're saying if `age` is the character string `"five"`, make it `"5"`, and if not leave it as `age`. +You will learn more about `if_else()` and logical vectors in @sec-logicals. + +### Other arguments + +There are a couple of other important arguments that we need to mention, and they'll be easier to demonstrate if we first show you a handy trick: `read_csv()` can read text strings that you've created and formatted like a CSV file: + +```{r} +#| message: false + +read_csv( + "a,b,c + 1,2,3 + 4,5,6" +) +``` + +Usually, `read_csv()` uses the first line of the data for the column names, which is a very common convention. +But it's not uncommon for a few lines of metadata to be included at the top of the file. +You can use `skip = n` to skip the first `n` lines or use `comment = "#"` to drop all lines that start with (e.g.) `#`: + +```{r} +#| message: false + +read_csv( + "The first line of metadata + The second line of metadata + x,y,z + 1,2,3", + skip = 2 +) + +read_csv( + "# A comment I want to skip + x,y,z + 1,2,3", + comment = "#" +) +``` + +In other cases, the data might not have column names. +You can use `col_names = FALSE` to tell `read_csv()` not to treat the first row as headings and instead label them sequentially from `X1` to `Xn`: + +```{r} +#| message: false + +read_csv( + "1,2,3 + 4,5,6", + col_names = FALSE +) +``` + +Alternatively, you can pass `col_names` a character vector which will be used as the column names: + +```{r} +#| message: false + +read_csv( + "1,2,3 + 4,5,6", + col_names = c("x", "y", "z") +) +``` + +These arguments are all you need to know to read the majority of CSV files that you'll encounter in practice. +(For the rest, you'll need to carefully inspect your `.csv` file and read the documentation for `read_csv()`'s many other arguments.) + +### Other file types + +Once you've mastered `read_csv()`, using readr's other functions is straightforward; it's just a matter of knowing which function to reach for: + +- `read_csv2()` reads semicolon-separated files. + These use `;` instead of `,` to separate fields and are common in countries that use `,` as the decimal marker. + +- `read_tsv()` reads tab-delimited files. + +- `read_delim()` reads in files with any delimiter, attempting to automatically guess the delimiter if you don't specify it. + +- `read_fwf()` reads fixed-width files. + You can specify fields by their widths with `fwf_widths()` or by their positions with `fwf_positions()`. + +- `read_table()` reads a common variation of fixed-width files where columns are separated by white space. + +- `read_log()` reads Apache-style log files. + +### Exercises + +1. What function would you use to read a file where fields were separated with "\|"? + +2. Apart from `file`, `skip`, and `comment`, what other arguments do `read_csv()` and `read_tsv()` have in common? + +3. What are the most important arguments to `read_fwf()`? + +4. Sometimes strings in a CSV file contain commas. + To prevent them from causing problems, they need to be surrounded by a quoting character, like `"` or `'`. By default, `read_csv()` assumes that the quoting character will be `"`. + To read the following text into a data frame, what argument to `read_csv()` do you need to specify? + + ```{r} + #| eval: false + + "x,y\n1,'a,b'" + ``` + +5. Identify what is wrong with each of the following inline CSV files. 
+ What happens when you run the code? + + ```{r} + #| eval: false + + read_csv("a,b\n1,2,3\n4,5,6") + read_csv("a,b,c\n1,2\n1,2,3,4") + read_csv("a,b\n\"1") + read_csv("a,b\n1,2\na,b") + read_csv("a;b\n1;3") + ``` + +6. Practice referring to non-syntactic names in the following data frame by: + + a. Extracting the variable called `1`. + b. Plotting a scatterplot of `1` vs. `2`. + c. Creating a new column called `3`, which is `2` divided by `1`. + d. Renaming the columns to `one`, `two`, and `three`. + + ```{r} + annoying <- tibble( + `1` = 1:10, + `2` = `1` * 2 + rnorm(length(`1`)) + ) + ``` + +## Controlling column types {#sec-col-types} + +A CSV file doesn't contain any information about the type of each variable (i.e., whether it's a logical, number, string, etc.), so readr will try to guess the type. +This section describes how the guessing process works, how to resolve some common problems that cause it to fail, and, if needed, how to supply the column types yourself. +Finally, we'll mention a few general strategies that are useful if readr is failing catastrophically and you need to get more insight into the structure of your file. + +### Guessing types + +readr uses a heuristic to figure out the column types. +For each column, it pulls the values of 1,000[^data-import-2] rows spaced evenly from the first row to the last, ignoring missing values. +It then works through the following questions: + +[^data-import-2]: You can override the default of 1000 with the `guess_max` argument. + +- Does it contain only `F`, `T`, `FALSE`, or `TRUE` (ignoring case)? If so, it's a logical. +- Does it contain only numbers (e.g., `1`, `-4.5`, `5e6`, `Inf`)? If so, it's a number. +- Does it match the ISO8601 standard? If so, it's a date or date-time. (We'll return to date-times in more detail in @sec-creating-datetimes). +- Otherwise, it must be a string. + +You can see that behavior in action in this simple example: + +```{r} +#| message: false + +read_csv(" + logical,numeric,date,string + TRUE,1,2021-01-15,abc + false,4.5,2021-02-15,def + T,Inf,2021-02-16,ghi +") +``` + +This heuristic works well if you have a clean dataset, but in real life, you'll encounter a selection of weird and beautiful failures. + +### Missing values, column types, and problems + +The most common way column detection fails is that a column contains unexpected values, and you get a character column instead of a more specific type. +One of the most common causes for this is a missing value, recorded using something other than the `NA` that readr expects. + +Take this simple one-column CSV file as an example: + +```{r} +simple_csv <- " + x + 10 + . + 20 + 30" +``` + +If we read it without any additional arguments, `x` becomes a character column: + +```{r} +#| message: false + +read_csv(simple_csv) +``` + +In this very small case, you can easily see the missing value `.`. +But what happens if you have thousands of rows with only a few missing values represented by `.`s sprinkled among them? +One approach is to tell readr that `x` is a numeric column, and then see where it fails. +You can do that with the `col_types` argument, which takes a named list where the names match the column names in the CSV file: + +```{r} +df <- read_csv( + simple_csv, + col_types = list(x = col_double()) +) +``` + +Now `read_csv()` reports that there was a problem, and tells us we can find out more with `problems()`: + +```{r} +problems(df) +``` + +This tells us that there was a problem in row 3, col 1, where readr expected a double but got a `.`.
+That suggests this dataset uses `.` for missing values. +So if we then set `na = "."`, the automatic guessing succeeds, giving us the numeric column that we want: + +```{r} +#| message: false + +read_csv(simple_csv, na = ".") +``` + +### Column types + +readr provides a total of nine column types for you to use: + +- `col_logical()` and `col_double()` read logicals and real numbers. They're relatively rarely needed (except as above), since readr will usually guess them for you. +- `col_integer()` reads integers. We seldom distinguish integers and doubles in this book because they're functionally equivalent, but reading integers explicitly can occasionally be useful because they occupy half the memory of doubles. +- `col_character()` reads strings. This can be useful to specify explicitly when you have a column that is a numeric identifier, i.e., long series of digits that identifies an object but doesn't make sense to apply mathematical operations to. Examples include phone numbers, social security numbers, credit card numbers, etc. +- `col_factor()`, `col_date()`, and `col_datetime()` create factors, dates, and date-times respectively; you'll learn more about those when we get to those data types in @sec-factors and @sec-dates-and-times. +- `col_number()` is a permissive numeric parser that will ignore non-numeric components, and is particularly useful for currencies. You'll learn more about it in @sec-numbers. +- `col_skip()` skips a column so it's not included in the result, which can be useful for speeding up reading the data if you have a large CSV file and you only want to use some of the columns. + +It's also possible to override the default column type by switching from `list()` to `cols()` and specifying `.default`: + +```{r} +another_csv <- " +x,y,z +1,2,3" + +read_csv( + another_csv, + col_types = cols(.default = col_character()) +) +``` + +Another useful helper is `cols_only()`, which will read in only the columns you specify: + +```{r} +read_csv( + another_csv, + col_types = cols_only(x = col_character()) +) +``` + +## Reading data from multiple files {#sec-readr-directory} + +Sometimes your data is split across multiple files instead of being contained in a single file. +For example, you might have sales data for multiple months, with each month's data in a separate file: `01-sales.csv` for January, `02-sales.csv` for February, and `03-sales.csv` for March. +With `read_csv()` you can read these data in at once and stack them on top of each other in a single data frame. + +```{r} +#| message: false + +sales_files <- c("data/01-sales.csv", "data/02-sales.csv", "data/03-sales.csv") +read_csv(sales_files, id = "file") +``` + +Once again, the code above will work if you have the CSV files in a `data` folder in your project. +You can download these files from <https://pos.it/r4ds-01-sales>, <https://pos.it/r4ds-02-sales>, and <https://pos.it/r4ds-03-sales> or you can read them directly with: + +```{r} +#| eval: false + +sales_files <- c( + "https://pos.it/r4ds-01-sales", + "https://pos.it/r4ds-02-sales", + "https://pos.it/r4ds-03-sales" +) +read_csv(sales_files, id = "file") +``` + +The `id` argument adds a new column called `file` to the resulting data frame that identifies the file the data come from. +This is especially helpful in circumstances where the files you're reading in do not have an identifying column that can help you trace the observations back to their original sources. + +If you have many files you want to read in, it can get cumbersome to write out their names as a list.
+Instead, you can use the base `list.files()` function to find the files for you by matching a pattern in the file names. +You'll learn more about these patterns in @sec-regular-expressions. + +```{r} +sales_files <- list.files("data", pattern = "sales\\.csv$", full.names = TRUE) +sales_files +``` + +## Writing to a file {#sec-writing-to-a-file} + +readr also comes with two useful functions for writing data back to disk: `write_csv()` and `write_tsv()`. +The most important arguments to these functions are `x` (the data frame to save) and `file` (the location to save it). +You can also specify how missing values are written with `na`, and if you want to `append` to an existing file. + +```{r} +#| eval: false + +write_csv(students, "students.csv") +``` + +Now let's read that CSV file back in. +Note that the variable type information that you just set up is lost when you save to CSV because you're starting over with reading from a plain text file again: + +```{r} +#| warning: false +#| message: false + +students +write_csv(students, "students-2.csv") +read_csv("students-2.csv") +``` + +This makes CSVs a little unreliable for caching interim results---you need to recreate the column specification every time you load it in. +There are two main alternatives: + +1. `write_rds()` and `read_rds()` are uniform wrappers around the base functions `readRDS()` and `saveRDS()`. + These store data in R's custom binary format called RDS. + This means that when you reload the object, you are loading the *exact same* R object that you stored. + + ```{r} + write_rds(students, "students.rds") + read_rds("students.rds") + ``` + +2. The arrow package allows you to read and write parquet files, a fast binary file format that can be shared across programming languages. + We'll return to arrow in more depth in @sec-arrow. + + ```{r} + #| eval: false + + library(arrow) + write_parquet(students, "students.parquet") + read_parquet("students.parquet") + #> # A tibble: 6 × 5 + #> student_id full_name favourite_food meal_plan age + #> <dbl> <chr> <chr> <fct> <dbl> + #> 1 1 Sunil Huffmann Strawberry yoghurt Lunch only 4 + #> 2 2 Barclay Lynn French fries Lunch only 5 + #> 3 3 Jayendra Lyne NA Breakfast and lunch 7 + #> 4 4 Leon Rossini Anchovies Lunch only NA + #> 5 5 Chidiegwu Dunkel Pizza Breakfast and lunch 5 + #> 6 6 Güvenç Attila Ice cream Lunch only 6 + ``` + +Parquet tends to be much faster than RDS and is usable outside of R, but does require the arrow package. + +```{r} +#| include: false +file.remove("students-2.csv") +file.remove("students.rds") +``` + +## Data entry + +Sometimes you'll need to assemble a tibble "by hand", doing a little data entry in your R script. +There are two useful functions to help you do this, which differ in whether you lay out the tibble by columns or by rows. +`tibble()` works by column: + +```{r} +tibble( + x = c(1, 2, 5), + y = c("h", "m", "g"), + z = c(0.08, 0.83, 0.60) +) +``` + +Laying out the data by column can make it hard to see how the rows are related, so an alternative is `tribble()`, short for **tr**ansposed t**ibble**, which lets you lay out your data row by row. +`tribble()` is customized for data entry in code: column headings start with `~` and entries are separated by commas. +This makes it possible to lay out small amounts of data in an easy-to-read form: + +```{r} +tribble( + ~x, ~y, ~z, + 1, "h", 0.08, + 2, "m", 0.83, + 5, "g", 0.60 +) +``` + +## Summary + +In this chapter, you've learned how to load CSV files with `read_csv()` and to do your own data entry with `tibble()` and `tribble()`.
+You've learned how CSV files work, some of the problems you might encounter, and how to overcome them. +We'll come back to data import a few times in this book: @sec-import-spreadsheets will show you how to load data from Excel and Google Sheets, @sec-import-databases from databases, @sec-arrow from parquet files, @sec-rectangling from JSON, and @sec-scraping from websites. + +We're just about at the end of this section of the book, but there's one important last topic to cover: how to get help. +So in the next chapter, you'll learn some good places to look for help, how to create a reprex to maximize your chances of getting good help, and some general advice on keeping up with the world of R. diff --git a/data-tidy.qmd b/data-tidy.qmd new file mode 100644 index 000000000..c9f962c16 --- /dev/null +++ b/data-tidy.qmd @@ -0,0 +1,591 @@ +# Data tidying {#sec-data-tidy} + +```{r} +#| echo: false + +source("_common.R") +``` + +## Introduction + +> "Happy families are all alike; every unhappy family is unhappy in its own way."\ +> --- Leo Tolstoy + +> "Tidy datasets are all alike, but every messy dataset is messy in its own way."\ +> --- Hadley Wickham + +In this chapter, you will learn a consistent way to organize your data in R using a system called **tidy data**. +Getting your data into this format requires some work up front, but that work pays off in the long term. +Once you have tidy data and the tidy tools provided by packages in the tidyverse, you will spend much less time munging data from one representation to another, allowing you to spend more time on the data questions you care about. + +In this chapter, you'll first learn the definition of tidy data and see it applied to a simple toy dataset. +Then we'll dive into the primary tool you'll use for tidying data: pivoting. +Pivoting allows you to change the form of your data without changing any of the values. + +### Prerequisites + +In this chapter, we'll focus on tidyr, a package that provides a bunch of tools to help tidy up your messy datasets. +tidyr is a member of the core tidyverse. + +```{r} +#| label: setup +#| message: false + +library(tidyverse) +``` + +From this chapter on, we'll suppress the loading message from `library(tidyverse)`. + +## Tidy data {#sec-tidy-data} + +You can represent the same underlying data in multiple ways. +The example below shows the same data organized in three different ways. +Each dataset shows the same values of four variables: *country*, *year*, *population*, and number of documented *cases* of TB (tuberculosis), but each dataset organizes the values in a different way. + +```{r} +table1 + +table2 + +table3 +``` + +These are all representations of the same underlying data, but they are not equally easy to use. +One of them, `table1`, will be much easier to work with inside the tidyverse because it's **tidy**. + +There are three interrelated rules that make a dataset tidy: + +1. Each variable is a column; each column is a variable. +2. Each observation is a row; each row is an observation. +3. Each value is a cell; each cell is a single value. + +@fig-tidy-structure shows the rules visually. + +```{r} +#| label: fig-tidy-structure +#| echo: false +#| fig-cap: | +#| The following three rules make a dataset tidy: variables are columns, +#| observations are rows, and values are cells. +#| fig-alt: | +#| Three panels, each representing a tidy data frame. The first panel +#| shows that each variable is a column. The second panel shows that each +#| observation is a row.
The third panel shows that each value is +#| a cell. + +knitr::include_graphics("images/tidy-1.png", dpi = 270) +``` + +Why ensure that your data is tidy? +There are two main advantages: + +1. There's a general advantage to picking one consistent way of storing data. + If you have a consistent data structure, it's easier to learn the tools that work with it because they have an underlying uniformity. + +2. There's a specific advantage to placing variables in columns because it allows R's vectorized nature to shine. + As you learned in @sec-mutate and @sec-summarize, most built-in R functions work with vectors of values. + That makes transforming tidy data feel particularly natural. + +dplyr, ggplot2, and all the other packages in the tidyverse are designed to work with tidy data. +Here are a few small examples showing how you might work with `table1`. + +```{r} +#| fig-width: 5 +#| fig-alt: | +#| This figure shows the number of cases in 1999 and 2000 for +#| Afghanistan, Brazil, and China, with year on the x-axis and number +#| of cases on the y-axis. Each point on the plot represents the number +#| of cases in a given country in a given year. The points for each +#| country are differentiated from others by color and shape and connected +#| with a line, resulting in three, non-parallel, non-intersecting lines. +#| The numbers of cases in China are highest for both 1999 and 2000, with +#| values above 200,000 for both years. The number of cases in Brazil is +#| approximately 40,000 in 1999 and approximately 75,000 in 2000. The +#| numbers of cases in Afghanistan are lowest for both 1999 and 2000, with +#| values that appear to be very close to 0 on this scale. + +# Compute rate per 10,000 +table1 |> + mutate(rate = cases / population * 10000) + +# Compute total cases per year +table1 |> + group_by(year) |> + summarize(total_cases = sum(cases)) + +# Visualize changes over time +ggplot(table1, aes(x = year, y = cases)) + + geom_line(aes(group = country), color = "grey50") + + geom_point(aes(color = country, shape = country)) + + scale_x_continuous(breaks = c(1999, 2000)) # x-axis breaks at 1999 and 2000 +``` + +### Exercises + +1. For each of the sample tables, describe what each observation and each column represents. + +2. Sketch out the process you'd use to calculate the `rate` for `table2` and `table3`. + You will need to perform four operations: + + a. Extract the number of TB cases per country per year. + b. Extract the matching population per country per year. + c. Divide cases by population, and multiply by 10000. + d. Store back in the appropriate place. + + You haven't yet learned all the functions you'd need to actually perform these operations, but you should still be able to think through the transformations you'd need. + +## Lengthening data {#sec-pivoting} + +The principles of tidy data might seem so obvious that you wonder if you'll ever encounter a dataset that isn't tidy. +Unfortunately, however, most real data is untidy. +There are two main reasons: + +1. Data is often organized to facilitate some goal other than analysis. + For example, it's common for data to be structured to make data entry, not analysis, easy. + +2. Most people aren't familiar with the principles of tidy data, and it's hard to derive them yourself unless you spend a lot of time working with data. + +This means that most real analyses will require at least a little tidying. +You'll begin by figuring out what the underlying variables and observations are. 
+Sometimes this is easy; other times you'll need to consult with the people who originally generated the data. +Next, you'll **pivot** your data into a tidy form, with variables in the columns and observations in the rows. + +tidyr provides two functions for pivoting data: `pivot_longer()` and `pivot_wider()`. +We'll start with `pivot_longer()` because it's the most common case. +Let's dive into some examples. + +### Data in column names {#sec-billboard} + +The `billboard` dataset records the billboard rank of songs in the year 2000: + +```{r} +billboard +``` + +In this dataset, each observation is a song. +The first three columns (`artist`, `track`, and `date.entered`) are variables that describe the song. +Then we have 76 columns (`wk1`-`wk76`) that describe the rank of the song in each week[^data-tidy-1]. +Here, the column names are one variable (the `week`) and the cell values are another (the `rank`). + +[^data-tidy-1]: The song will be included as long as it was in the top 100 at some point in 2000, and is tracked for up to 72 weeks after it appears. + +To tidy this data, we'll use `pivot_longer()`: + +```{r, R.options=list(pillar.print_min = 10)} +billboard |> + pivot_longer( + cols = starts_with("wk"), + names_to = "week", + values_to = "rank" + ) +``` + +After the data, there are three key arguments: + +- `cols` specifies which columns need to be pivoted, i.e., which columns aren't variables. This argument uses the same syntax as `select()`, so here we could use `!c(artist, track, date.entered)` or `starts_with("wk")`. +- `names_to` names the variable stored in the column names; here we named that variable `week`. +- `values_to` names the variable stored in the cell values; here we named that variable `rank`. + +Note that in the code `"week"` and `"rank"` are quoted because those are new variables we're creating; they don't yet exist in the data when we run the `pivot_longer()` call. + +Now let's turn our attention to the resulting, longer data frame. +What happens if a song is in the top 100 for less than 76 weeks? +Take 2 Pac's "Baby Don't Cry", for example. +The above output suggests that it was only in the top 100 for 7 weeks, and all the remaining weeks are filled in with missing values. +These `NA`s don't really represent unknown observations; they were forced to exist by the structure of the dataset[^data-tidy-2], so we can ask `pivot_longer()` to get rid of them by setting `values_drop_na = TRUE`: + +[^data-tidy-2]: We'll come back to this idea in @sec-missing-values. + +```{r} +billboard |> + pivot_longer( + cols = starts_with("wk"), + names_to = "week", + values_to = "rank", + values_drop_na = TRUE + ) +``` + +The number of rows is now much lower, indicating that many rows with `NA`s were dropped. + +You might also wonder what happens if a song is in the top 100 for more than 76 weeks. +We can't tell from this data, but you might guess that additional columns `wk77`, `wk78`, ... would be added to the dataset. + +This data is now tidy, but we could make future computation a bit easier by converting values of `week` from character strings to numbers using `mutate()` and `readr::parse_number()`. +`parse_number()` is a handy function that will extract the first number from a string, ignoring all other text.
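+For example, here's a quick sketch of what it does with week labels like the ones in this dataset: + +```{r} +# parse_number() drops the "wk" prefix and keeps the first number it finds +parse_number(c("wk1", "wk12", "wk76")) +```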
+ +```{r} +billboard_longer <- billboard |> + pivot_longer( + cols = starts_with("wk"), + names_to = "week", + values_to = "rank", + values_drop_na = TRUE + ) |> + mutate( + week = parse_number(week) + ) +billboard_longer +``` + +Now that we have all the week numbers in one variable and all the rank values in another, we're in a good position to visualize how song ranks vary over time. +The code is shown below and the result is in @fig-billboard-ranks. +We can see that very few songs stay in the top 100 for more than 20 weeks. + +```{r} +#| label: fig-billboard-ranks +#| fig-cap: | +#| A line plot showing how the rank of a song changes over time. +#| fig-alt: | +#| A line plot with week on the x-axis and rank on the y-axis, where +#| each line represents a song. Most songs appear to start at a high rank, +#| rapidly accelerate to a low rank, and then decay again. There are +#| surprisingly few tracks in the region when week is >20 and rank is +#| >50. + +billboard_longer |> + ggplot(aes(x = week, y = rank, group = track)) + + geom_line(alpha = 0.25) + + scale_y_reverse() +``` + +### How does pivoting work? + +Now that you've seen how we can use pivoting to reshape our data, let's take a little time to gain some intuition about what pivoting does to the data. +Let's start with a very simple dataset to make it easier to see what's happening. +Suppose we have three patients with `id`s A, B, and C, and we take two blood pressure measurements on each patient. +We'll create the data with `tribble()`, a handy function for constructing small tibbles by hand: + +```{r} +df <- tribble( + ~id, ~bp1, ~bp2, + "A", 100, 120, + "B", 140, 115, + "C", 120, 125 +) +``` + +We want our new dataset to have three variables: `id` (already exists), `measurement` (the column names), and `value` (the cell values). +To achieve this, we need to pivot `df` longer: + +```{r} +df |> + pivot_longer( + cols = bp1:bp2, + names_to = "measurement", + values_to = "value" + ) +``` + +How does the reshaping work? +It's easier to see if we think about it column by column. +As shown in @fig-pivot-variables, the values in a column that was already a variable in the original dataset (`id`) need to be repeated, once for each column that is pivoted. + +```{r} +#| label: fig-pivot-variables +#| echo: false +#| fig-cap: | +#| Columns that are already variables need to be repeated, once for +#| each column that is pivoted. +#| fig-alt: | +#| A diagram showing how `pivot_longer()` transforms a simple +#| dataset, using color to highlight how the values in the `id` column +#| ("A", "B", "C") are each repeated twice in the output because there are +#| two columns being pivoted ("bp1" and "bp2"). + +knitr::include_graphics("diagrams/tidy-data/variables.png", dpi = 270) +``` + +The column names become values in a new variable, whose name is defined by `names_to`, as shown in @fig-pivot-names. +They need to be repeated once for each row in the original dataset. + +```{r} +#| label: fig-pivot-names +#| echo: false +#| fig-cap: | +#| The column names of pivoted columns become values in a new column. The +#| values need to be repeated once for each row of the original dataset. +#| fig-alt: | +#| A diagram showing how `pivot_longer()` transforms a simple +#| data set, using color to highlight how column names ("bp1" and +#| "bp2") become the values in a new `measurement` column. They are repeated +#| three times because there were three rows in the input. 
+ +knitr::include_graphics("diagrams/tidy-data/column-names.png", dpi = 270) +``` + +The cell values also become values in a new variable, with a name defined by `values_to`. +They are unwound row by row. +@fig-pivot-values illustrates the process. + +```{r} +#| label: fig-pivot-values +#| echo: false +#| fig-cap: | +#| The number of values is preserved (not repeated), but unwound +#| row-by-row. +#| fig-alt: | +#| A diagram showing how `pivot_longer()` transforms data, +#| using color to highlight how the cell values (blood pressure measurements) +#| become the values in a new `value` column. They are unwound row-by-row, +#| so the original rows (100,120), then (140,115), then (120,125), become +#| a column running from 100 to 125. + +knitr::include_graphics("diagrams/tidy-data/cell-values.png", dpi = 270) +``` + +### Many variables in column names + +A more challenging situation occurs when you have multiple pieces of information crammed into the column names, and you would like to store these in separate new variables. +For example, take the `who2` dataset, the source of `table1` and friends that you saw above: + +```{r} +who2 +``` + +This dataset, collected by the World Health Organisation, records information about tuberculosis diagnoses. +There are two columns that are already variables and are easy to interpret: `country` and `year`. +They are followed by 56 columns like `sp_m_014`, `ep_m_4554`, and `rel_m_3544`. +If you stare at these columns for long enough, you'll notice there's a pattern. +Each column name is made up of three pieces separated by `_`. +The first piece, `sp`/`rel`/`ep`, describes the method used for the diagnosis, the second piece, `m`/`f`, is the `gender` (coded as a binary variable in this dataset), and the third piece, `014`/`1524`/`2534`/`3544`/`4554`/`5564`/`65`, is the `age` range (`014` represents 0-14, for example). + +So in this case we have six pieces of information recorded in `who2`: the country and the year (already columns); the method of diagnosis, the gender category, and the age range category (contained in the other column names); and the count of patients in that category (cell values). +To organize these six pieces of information in six separate columns, we use `pivot_longer()` with a vector of column names for `names_to` and instructions for splitting the original variable names into pieces for `names_sep` as well as a column name for `values_to`: + +```{r} +who2 |> + pivot_longer( + cols = !(country:year), + names_to = c("diagnosis", "gender", "age"), + names_sep = "_", + values_to = "count" + ) +``` + +An alternative to `names_sep` is `names_pattern`, which you can use to extract variables from more complicated naming scenarios, once you've learned about regular expressions in @sec-regular-expressions. + +Conceptually, this is only a minor variation on the simpler case you've already seen. +@fig-pivot-multiple-names shows the basic idea: now, instead of the column names pivoting into a single column, they pivot into multiple columns. +You can imagine this happening in two steps (first pivoting and then separating) but under the hood it happens in a single step because that's faster. + +```{r} +#| label: fig-pivot-multiple-names +#| echo: false +#| fig-cap: | +#| Pivoting columns with multiple pieces of information in the names +#| means that each column name now fills in values in multiple output +#| columns.
+#| fig-alt: | +#| A diagram that uses color to illustrate how supplying `names_sep` +#| and multiple `names_to` creates multiple variables in the output. +#| The input has variable names "x_1" and "y_2" which are split up +#| by "_" to create name and number columns in the output. This is +#| similar to the case with a single `names_to`, but what would have been a +#| single output variable is now separated into multiple variables. + +knitr::include_graphics("diagrams/tidy-data/multiple-names.png", dpi = 270) +``` + +### Data and variable names in the column headers + +The next step up in complexity is when the column names include a mix of variable values and variable names. +For example, take the `household` dataset: + +```{r} +household +``` + +This dataset contains data about five families, with the names and dates of birth of up to two children. +The new challenge in this dataset is that the column names contain the names of two variables (`dob`, `name`) and the values of another (`child`, with values 1 or 2). +To solve this problem, we again need to supply a vector to `names_to`, but this time we use the special `".value"` sentinel; this isn't the name of a variable but a unique value that tells `pivot_longer()` to do something different. +This overrides the usual `values_to` argument to use the first component of the pivoted column name as a variable name in the output. + +```{r} +household |> + pivot_longer( + cols = !family, + names_to = c(".value", "child"), + names_sep = "_", + values_drop_na = TRUE + ) +``` + +We again use `values_drop_na = TRUE`, since the shape of the input forces the creation of explicit missing values (e.g., for families with only one child). + +@fig-pivot-names-and-values illustrates the basic idea with a simpler example. +When you use `".value"` in `names_to`, the column names in the input contribute to both values and variable names in the output. + +```{r} +#| label: fig-pivot-names-and-values +#| echo: false +#| fig-cap: | +#| Pivoting with `names_to = c(".value", "num")` splits the column names +#| into two components: the first part determines the output column +#| name (`x` or `y`), and the second part determines the value of the +#| `num` column. +#| fig-alt: | +#| A diagram that uses color to illustrate how the special ".value" +#| sentinel works. The input has names "x_1", "x_2", "y_1", and "y_2", +#| and we want to use the first component ("x", "y") as a variable name +#| and the second ("1", "2") as the value for a new "num" column. + +knitr::include_graphics("diagrams/tidy-data/names-and-values.png", dpi = 270) +``` + +## Widening data + +So far we've used `pivot_longer()` to solve the common class of problems where values have ended up in column names. +Next we'll pivot (HA HA) to `pivot_wider()`, which makes datasets **wider** by increasing columns and reducing rows and helps when one observation is spread across multiple rows. +This seems to arise less commonly in the wild, but it does seem to crop up a lot when dealing with governmental data. + +We'll start by looking at `cms_patient_experience`, a dataset from the Centers for Medicare and Medicaid Services that collects data about patient experiences: + +```{r} +cms_patient_experience +``` + +The core unit being studied is an organization, but each organization is spread across six rows, with one row for each measurement taken in the survey.
+We can see the complete set of values for `measure_cd` and `measure_title` by using `distinct()`: + +```{r} +cms_patient_experience |> + distinct(measure_cd, measure_title) +``` + +Neither of these columns will make particularly great variable names: `measure_cd` doesn't hint at the meaning of the variable and `measure_title` is a long sentence containing spaces. +We'll use `measure_cd` as the source for our new column names for now, but in a real analysis you might want to create your own variable names that are both short and meaningful. + +`pivot_wider()` has the opposite interface to `pivot_longer()`: instead of choosing new column names, we need to provide the existing columns that define the values (`values_from`) and the column name (`names_from`): + +```{r} +cms_patient_experience |> + pivot_wider( + names_from = measure_cd, + values_from = prf_rate + ) +``` + +The output doesn't look quite right; we still seem to have multiple rows for each organization. +That's because we also need to tell `pivot_wider()` which column or columns have values that uniquely identify each row; in this case those are the variables starting with `"org"`: + +```{r} +cms_patient_experience |> + pivot_wider( + id_cols = starts_with("org"), + names_from = measure_cd, + values_from = prf_rate + ) +``` + +This gives us the output that we're looking for. + +### How does `pivot_wider()` work? + +To understand how `pivot_wider()` works, let's again start with a very simple dataset. +This time we have two patients with `id`s A and B; we have three blood pressure measurements on patient A and two on patient B: + +```{r} +df <- tribble( + ~id, ~measurement, ~value, + "A", "bp1", 100, + "B", "bp1", 140, + "B", "bp2", 115, + "A", "bp2", 120, + "A", "bp3", 105 +) +``` + +We'll take the values from the `value` column and the names from the `measurement` column: + +```{r} +df |> + pivot_wider( + names_from = measurement, + values_from = value + ) +``` + +To begin the process, `pivot_wider()` needs to first figure out what will go in the rows and columns. +The new column names will be the unique values of `measurement`. + +```{r} +df |> + distinct(measurement) |> + pull() +``` + +By default, the rows in the output are determined by all the variables that aren't going into the new names or values. +These are called the `id_cols`. +Here there is only one column, but in general there can be any number. + +```{r} +df |> + select(-measurement, -value) |> + distinct() +``` + +`pivot_wider()` then combines these results to generate an empty data frame: + +```{r} +df |> + select(-measurement, -value) |> + distinct() |> + mutate(bp1 = NA, bp2 = NA, bp3 = NA) +``` + +It then fills in all the missing values using the data in the input. +In this case, not every cell in the output has a corresponding value in the input as there's no third blood pressure measurement for patient B, so that cell remains missing. +We'll come back to this idea that `pivot_wider()` can "make" missing values in @sec-missing-values. + +You might also wonder what happens if there are multiple rows in the input that correspond to one cell in the output.
+The example below has two rows that correspond to `id` "A" and `measurement` "bp1": + +```{r} +df <- tribble( + ~id, ~measurement, ~value, + "A", "bp1", 100, + "A", "bp1", 102, + "A", "bp2", 120, + "B", "bp1", 140, + "B", "bp2", 115 +) +``` + +If we attempt to pivot this, we get an output that contains list-columns, which you'll learn more about in @sec-rectangling: + +```{r} +df |> + pivot_wider( + names_from = measurement, + values_from = value + ) +``` + +Since you don't know how to work with this sort of data yet, you'll want to follow the hint in the warning to figure out where the problem is: + +```{r} +df |> + group_by(id, measurement) |> + summarize(n = n(), .groups = "drop") |> + filter(n > 1) +``` + +It's then up to you to figure out what's gone wrong with your data and either repair the underlying damage or use your grouping and summarizing skills to ensure that each combination of row and column values only has a single row. + +## Summary + +In this chapter you learned about tidy data: data that has variables in columns and observations in rows. +Tidy data makes working in the tidyverse easier, because it's a consistent structure understood by most functions. The main challenge is transforming the data from whatever structure you receive it in to a tidy format. +To that end, you learned about `pivot_longer()` and `pivot_wider()`, which allow you to tidy up many untidy datasets. +The examples we presented here are a selection of those from `vignette("pivot", package = "tidyr")`, so if you encounter a problem that this chapter doesn't help you with, that vignette is a good place to try next. + +Another challenge is that, for a given dataset, it can be impossible to label the longer or the wider version as the "tidy" one. +This is partly a reflection of our definition of tidy data, where we said tidy data has one variable in each column, but we didn't actually define what a variable is (and it's surprisingly hard to do so). +It's totally fine to be pragmatic and to say a variable is whatever makes your analysis easiest. +So if you're stuck figuring out how to do some computation, consider switching up the organisation of your data; don't be afraid to untidy, transform, and re-tidy as needed! + +If you enjoyed this chapter and want to learn more about the underlying theory, you can read about the history and theoretical underpinnings in the [Tidy Data](https://www.jstatsoft.org/article/view/v059i10) paper published in the Journal of Statistical Software. + +Now that you're writing a substantial amount of R code, it's time to learn more about organizing your code into files and directories. +In the next chapter, you'll learn all about the advantages of scripts and projects, and some of the many tools that they provide to make your life easier. diff --git a/data-transform.qmd b/data-transform.qmd new file mode 100644 index 000000000..3011f0ec2 --- /dev/null +++ b/data-transform.qmd @@ -0,0 +1,887 @@ +# Data transformation {#sec-data-transform} + +```{r} +#| echo: false + +source("_common.R") +``` + +## Introduction + +Visualization is an important tool for generating insight, but it's rare that you get the data in exactly the right form you need to make the graph you want. +Often you'll need to create some new variables or summaries to answer your questions with your data, or maybe you just want to rename the variables or reorder the observations to make the data a little easier to work with. +You'll learn how to do all that (and more!)
in this chapter, which will introduce you to data transformation using the **dplyr** package and a new dataset on flights that departed from New York City in 2013. + +The goal of this chapter is to give you an overview of all the key tools for transforming a data frame. +We'll start with functions that operate on rows and then columns of a data frame, then circle back to talk more about the pipe, an important tool that you use to combine verbs. +We will then introduce the ability to work with groups. +We will end the chapter with a case study that showcases these functions in action, and we'll come back to the functions in more detail in later chapters, as we start to dig into specific types of data (e.g., numbers, strings, dates). + +### Prerequisites + +In this chapter, we'll focus on the dplyr package, another core member of the tidyverse. +We'll illustrate the key ideas using data from the nycflights13 package, and use ggplot2 to help us understand the data. + +```{r} +#| label: setup + +library(nycflights13) +library(tidyverse) +``` + +Take careful note of the conflicts message that's printed when you load the tidyverse. +It tells you that dplyr overwrites some functions in base R. +If you want to use the base version of these functions after loading dplyr, you'll need to use their full names: `stats::filter()` and `stats::lag()`. +So far we've mostly ignored which package a function comes from because most of the time it doesn't matter. +However, knowing the package can help you find help and find related functions, so when we need to be precise about which package a function comes from, we'll use the same syntax as R: `packagename::functionname()`. + +### nycflights13 + +To explore the basic dplyr verbs, we're going to use `nycflights13::flights`. +This dataset contains all `r format(nrow(nycflights13::flights), big.mark = ",")` flights that departed from New York City in 2013. +The data comes from the US [Bureau of Transportation Statistics](http://www.transtats.bts.gov/DatabaseInfo.asp?DB_ID=120&Link=0), and is documented in `?flights`. + +```{r} +flights +``` + +`flights` is a tibble, a special type of data frame used by the tidyverse to avoid some common gotchas. +The most important difference between tibbles and data frames is the way tibbles print; they are designed for large datasets, so they only show the first few rows and only the columns that fit on one screen. +There are a few options to see everything. +If you're using RStudio, the most convenient is probably `View(flights)`, which will open an interactive scrollable and filterable view. +Otherwise you can use `print(flights, width = Inf)` to show all columns, or use `glimpse()`: + +```{r} +glimpse(flights) +``` + +In both views, the variable names are followed by abbreviations that tell you the type of each variable: `<int>` is short for integer, `<dbl>` is short for double (aka real numbers), `<chr>` for character (aka strings), and `<dttm>` for date-time. +These are important because the operations you can perform on a column depend so much on its "type". + +### dplyr basics + +You're about to learn the primary dplyr verbs (functions), which will allow you to solve the vast majority of your data manipulation challenges. +But before we discuss their individual differences, it's worth stating what they have in common: + +1. The first argument is always a data frame. + +2. The subsequent arguments typically describe which columns to operate on, using the variable names (without quotes). + +3. The output is always a new data frame.
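+ +As a quick sketch of these three properties (using `filter()`, which you'll meet properly below), note that the first argument is the data frame, the column `dest` is referred to without quotes, and the result is a new data frame; `flights` itself is left unchanged: + +```{r} +# filter() takes a data frame as its first argument, refers to columns by +# their bare names, and returns a new data frame without modifying flights +nrow(filter(flights, dest == "IAH")) +nrow(flights) +```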
+ +Because each verb does one thing well, solving complex problems will usually require combining multiple verbs, and we'll do so with the pipe, `|>`. +We'll discuss the pipe more in @sec-the-pipe, but in brief, the pipe takes the thing on its left and passes it along to the function on its right so that `x |> f(y)` is equivalent to `f(x, y)`, and `x |> f(y) |> g(z)` is equivalent to `g(f(x, y), z)`. +The easiest way to pronounce the pipe is "then". +That makes it possible to get a sense of the following code even though you haven't yet learned the details: + +```{r} +#| eval: false + +flights |> + filter(dest == "IAH") |> + group_by(year, month, day) |> + summarize( + arr_delay = mean(arr_delay, na.rm = TRUE) + ) +``` + +dplyr's verbs are organized into four groups based on what they operate on: **rows**, **columns**, **groups**, or **tables**. +In the following sections you'll learn the most important verbs for rows, columns, and groups, then we'll come back to the join verbs that work on tables in @sec-joins. +Let's dive in! + +## Rows + +The most important verbs that operate on rows of a dataset are `filter()`, which changes which rows are present without changing their order, and `arrange()`, which changes the order of the rows without changing which are present. +Both functions only affect the rows, and the columns are left unchanged. +We'll also discuss `distinct()`, which finds rows with unique values; unlike `arrange()` and `filter()`, it can also optionally modify the columns. + +### `filter()` + +`filter()` allows you to keep rows based on the values of the columns[^data-transform-1]. +The first argument is the data frame. +The second and subsequent arguments are the conditions that must be true to keep the row. +For example, we could find all flights that departed more than 120 minutes (two hours) late: + +[^data-transform-1]: Later, you'll learn about the `slice_*()` family, which allows you to choose rows based on their positions. + +```{r} +flights |> + filter(dep_delay > 120) +``` + +As well as `>` (greater than), you can use `>=` (greater than or equal to), `<` (less than), `<=` (less than or equal to), `==` (equal to), and `!=` (not equal to). +You can also combine conditions with `&` or `,` to indicate "and" (check for both conditions) or with `|` to indicate "or" (check for either condition): + +```{r} +# Flights that departed on January 1 +flights |> + filter(month == 1 & day == 1) + +# Flights that departed in January or February +flights |> + filter(month == 1 | month == 2) +``` + +There's a useful shortcut when you're combining `|` and `==`: `%in%`. +It keeps rows where the variable equals one of the values on the right: + +```{r} +# A shorter way to select flights that departed in January or February +flights |> + filter(month %in% c(1, 2)) +``` + +We'll come back to these comparisons and logical operators in more detail in @sec-logicals. + +When you run `filter()`, dplyr executes the filtering operation, creating a new data frame, and then prints it. +It doesn't modify the existing `flights` dataset because dplyr functions never modify their inputs. +To save the result, you need to use the assignment operator, `<-`: + +```{r} +jan1 <- flights |> + filter(month == 1 & day == 1) +``` + +### Common mistakes + +When you're starting out with R, the easiest mistake to make is to use `=` instead of `==` when testing for equality.
+`filter()` will let you know when this happens: + +```{r} +#| error: true + +flights |> + filter(month = 1) +``` + +Another mistake is writing "or" statements like you would in English: + +```{r} +#| eval: false + +flights |> + filter(month == 1 | 2) +``` + +This "works", in the sense that it doesn't throw an error, but it doesn't do what you want because `|` first checks the condition `month == 1` and then checks the condition `2`, which is not a sensible condition to check. +We'll learn more about what's happening here and why in @sec-boolean-operations. + +### `arrange()` + +`arrange()` changes the order of the rows based on the value of the columns. +It takes a data frame and a set of column names (or more complicated expressions) to order by. +If you provide more than one column name, each additional column will be used to break ties in the values of preceding columns. +For example, the following code sorts by the departure time, which is spread over four columns. +We get the earliest years first, then within a year the earliest months, etc. + +```{r} +flights |> + arrange(year, month, day, dep_time) +``` + +You can use `desc()` on a column inside of `arrange()` to re-order the data frame based on that column in descending (big-to-small) order. +For example, this code orders flights from most to least delayed: + +```{r} +flights |> + arrange(desc(dep_delay)) +``` + +Note that the number of rows has not changed -- we're only arranging the data, we're not filtering it. + +### `distinct()` + +`distinct()` finds all the unique rows in a dataset, so in a technical sense, it primarily operates on the rows. +Most of the time, however, you'll want the distinct combination of some variables, so you can also optionally supply column names: + +```{r} +# Remove duplicate rows, if any +flights |> + distinct() + +# Find all unique origin and destination pairs +flights |> + distinct(origin, dest) +``` + +Alternatively, if you want to keep the other columns when filtering for unique rows, you can use the `.keep_all = TRUE` option. + +```{r} +flights |> + distinct(origin, dest, .keep_all = TRUE) +``` + +It's not a coincidence that all of these distinct flights are on January 1: `distinct()` will find the first occurrence of a unique row in the dataset and discard the rest. + +If you want to find the number of occurrences instead, you're better off swapping `distinct()` for `count()`, and with the `sort = TRUE` argument you can arrange them in descending order of number of occurrences. +You'll learn more about `count()` in @sec-counts. + +```{r} +flights |> + count(origin, dest, sort = TRUE) +``` + +### Exercises + +1. In a single pipeline for each condition, find all flights that meet the condition: + + - Had an arrival delay of two or more hours + - Flew to Houston (`IAH` or `HOU`) + - Were operated by United, American, or Delta + - Departed in summer (July, August, and September) + - Arrived more than two hours late, but didn't leave late + - Were delayed by at least an hour, but made up over 30 minutes in flight + +2. Sort `flights` to find the flights with the longest departure delays. + Find the flights that left earliest in the morning. + +3. Sort `flights` to find the fastest flights. + (Hint: Try including a math calculation inside of your function.) + +4. Was there a flight on every day of 2013? + +5. Which flights traveled the farthest distance? + Which traveled the least distance? + +6. Does it matter in what order you use `filter()` and `arrange()` if you're using both? + Why/why not?
+ Think about the results and how much work the functions would have to do.

## Columns

There are four important verbs that affect the columns without changing the rows: `mutate()` creates new columns that are derived from the existing columns, `select()` changes which columns are present, `rename()` changes the names of the columns, and `relocate()` changes the positions of the columns.

### `mutate()` {#sec-mutate}

The job of `mutate()` is to add new columns that are calculated from the existing columns.
In the transform chapters, you'll learn a large set of functions that you can use to manipulate different types of variables.
For now, we'll stick with basic algebra, which allows us to compute the `gain`, how much time a delayed flight made up in the air, and the `speed` in miles per hour:

```{r}
flights |>
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60
  )
```

By default, `mutate()` adds new columns on the right hand side of your dataset, which makes it difficult to see what's happening here.
We can use the `.before` argument to instead add the variables to the left hand side[^data-transform-2]:

[^data-transform-2]: Remember that in RStudio, the easiest way to see a dataset with many columns is `View()`.

```{r}
flights |>
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    .before = 1
  )
```

The `.` is a sign that `.before` is an argument to the function, not the name of a third new variable we are creating.
You can also use `.after` to add after a variable, and in both `.before` and `.after` you can use the variable name instead of a position.
For example, we could add the new variables after `day`:

```{r}
#| results: false

flights |>
  mutate(
    gain = dep_delay - arr_delay,
    speed = distance / air_time * 60,
    .after = day
  )
```

Alternatively, you can control which variables are kept with the `.keep` argument.
A particularly useful argument is `"used"`, which specifies that we only keep the columns that were involved or created in the `mutate()` step.
For example, the following output will contain only the variables `dep_delay`, `arr_delay`, `air_time`, `gain`, `hours`, and `gain_per_hour`.

```{r}
#| results: false

flights |>
  mutate(
    gain = dep_delay - arr_delay,
    hours = air_time / 60,
    gain_per_hour = gain / hours,
    .keep = "used"
  )
```

Note that since we haven't assigned the result of the above computation back to `flights`, the new variables `gain`, `hours`, and `gain_per_hour` will only be printed but will not be stored in a data frame.
And if we want them to be available in a data frame for future use, we should think carefully about whether we want the result to be assigned back to `flights`, overwriting the original data frame with many more variables, or to a new object.
Often, the right answer is a new object that is named informatively to indicate its contents, e.g., `delay_gain`, but you might also have good reasons for overwriting `flights`.

### `select()` {#sec-select}

It's not uncommon to get datasets with hundreds or even thousands of variables.
In this situation, the first challenge is often just focusing on the variables you're interested in.
+`select()` allows you to rapidly zoom in on a useful subset using operations based on the names of the variables: + +- Select columns by name: + + ```{r} + #| results: false + + flights |> + select(year, month, day) + ``` + +- Select all columns between year and day (inclusive): + + ```{r} + #| results: false + + flights |> + select(year:day) + ``` + +- Select all columns except those from year to day (inclusive): + + ```{r} + #| results: false + + flights |> + select(!year:day) + ``` + + You can also use `-` instead of `!` (and you're likely to see that in the wild); we recommend `!` because it reads as "not", and combines well with `&` and `|`. + +- Select all columns that are characters: + + ```{r} + #| results: false + + flights |> + select(where(is.character)) + ``` + +There are a number of helper functions you can use within `select()`: + +- `starts_with("abc")`: matches names that begin with "abc". +- `ends_with("xyz")`: matches names that end with "xyz". +- `contains("ijk")`: matches names that contain "ijk". +- `num_range("x", 1:3)`: matches `x1`, `x2` and `x3`. + +See `?select` for more details. +Once you know regular expressions (the topic of @sec-regular-expressions) you'll also be able to use `matches()` to select variables that match a pattern. + +You can rename variables as you `select()` them by using `=`. +The new name appears on the left hand side of the `=`, and the old variable appears on the right hand side: + +```{r} +flights |> + select(tail_num = tailnum) +``` + +### `rename()` + +If you want to keep all the existing variables and just want to rename a few, you can use `rename()` instead of `select()`: + +```{r} +flights |> + rename(tail_num = tailnum) +``` + +If you have a bunch of inconsistently named columns and it would be painful to fix them all by hand, check out `janitor::clean_names()` which provides some useful automated cleaning. + +### `relocate()` + +Use `relocate()` to move variables around. +You might want to collect related variables together or move important variables to the front. +By default `relocate()` moves variables to the front: + +```{r} +flights |> + relocate(time_hour, air_time) +``` + +You can also specify where to put them using the `.before` and `.after` arguments, just like in `mutate()`: + +```{r} +#| results: false + +flights |> + relocate(year:dep_time, .after = time_hour) +flights |> + relocate(starts_with("arr"), .before = dep_time) +``` + +### Exercises + +```{r} +#| eval: false +#| echo: false + +# For data checking, not used in results shown in book +flights <- flights |> mutate( + dep_time = hour * 60 + minute, + arr_time = (arr_time %/% 100) * 60 + (arr_time %% 100), + airtime2 = arr_time - dep_time, + dep_sched = dep_time + dep_delay +) + +ggplot(flights, aes(x = dep_sched)) + geom_histogram(binwidth = 60) +ggplot(flights, aes(x = dep_sched %% 60)) + geom_histogram(binwidth = 1) +ggplot(flights, aes(x = air_time - airtime2)) + geom_histogram() +``` + +1. Compare `dep_time`, `sched_dep_time`, and `dep_delay`. + How would you expect those three numbers to be related? + +2. Brainstorm as many ways as possible to select `dep_time`, `dep_delay`, `arr_time`, and `arr_delay` from `flights`. + +3. What happens if you specify the name of the same variable multiple times in a `select()` call? + +4. What does the `any_of()` function do? + Why might it be helpful in conjunction with this vector? + + ```{r} + variables <- c("year", "month", "day", "dep_delay", "arr_delay") + ``` + +5. 
Does the result of running the following code surprise you?
   How do the select helpers deal with upper and lower case by default?
   How can you change that default?

   ```{r}
   #| eval: false

   flights |> select(contains("TIME"))
   ```

6. Rename `air_time` to `air_time_min` to indicate units of measurement and move it to the beginning of the data frame.

7. Why doesn't the following work, and what does the error mean?

   ```{r}
   #| error: true

   flights |>
     select(tailnum) |>
     arrange(arr_delay)
   ```

## The pipe {#sec-the-pipe}

We've shown you simple examples of the pipe above, but its real power arises when you start to combine multiple verbs.
For example, imagine that you wanted to find the fast flights to Houston's IAH airport: you need to combine `filter()`, `mutate()`, `select()`, and `arrange()`:

```{r}
flights |>
  filter(dest == "IAH") |>
  mutate(speed = distance / air_time * 60) |>
  select(year:day, dep_time, carrier, flight, speed) |>
  arrange(desc(speed))
```

Even though this pipeline has four steps, it's easy to skim because the verbs come at the start of each line: start with the `flights` data, then filter, then mutate, then select, then arrange.

What would happen if we didn't have the pipe?
We could nest each function call inside the previous call:

```{r}
#| results: false

arrange(
  select(
    mutate(
      filter(
        flights,
        dest == "IAH"
      ),
      speed = distance / air_time * 60
    ),
    year:day, dep_time, carrier, flight, speed
  ),
  desc(speed)
)
```

Or we could use a bunch of intermediate objects:

```{r}
#| results: false

flights1 <- filter(flights, dest == "IAH")
flights2 <- mutate(flights1, speed = distance / air_time * 60)
flights3 <- select(flights2, year:day, dep_time, carrier, flight, speed)
arrange(flights3, desc(speed))
```

While both forms have their time and place, the pipe generally produces data analysis code that is easier to write and read.

To add the pipe to your code, we recommend using the built-in keyboard shortcut Ctrl/Cmd + Shift + M.
You'll need to make one change to your RStudio options to use `|>` instead of `%>%` as shown in @fig-pipe-options; more on `%>%` shortly.

```{r}
#| label: fig-pipe-options
#| echo: false
#| fig-cap: |
#|   To insert `|>`, make sure the "Use native pipe operator" option is checked.
#| fig-alt: |
#|   Screenshot showing the "Use native pipe operator" option which can
#|   be found on the "Editing" panel of the "Code" options.

knitr::include_graphics("screenshots/rstudio-pipe-options.png")
```

::: callout-note
## magrittr

If you've been using the tidyverse for a while, you might be familiar with the `%>%` pipe provided by the **magrittr** package.
The magrittr package is included in the core tidyverse, so you can use `%>%` whenever you load the tidyverse:

```{r}
#| eval: false

library(tidyverse)

mtcars %>%
  group_by(cyl) %>%
  summarize(n = n())
```

For simple cases, `|>` and `%>%` behave identically.
So why do we recommend the base pipe?
Firstly, because it's part of base R, it's always available for you to use, even when you're not using the tidyverse.
Secondly, `|>` is quite a bit simpler than `%>%`: in the time between the invention of `%>%` in 2014 and the inclusion of `|>` in R 4.1.0 in 2021, we gained a better understanding of the pipe.
This allowed the base implementation to jettison infrequently used and less important features.
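
For comparison, here's the same pipeline written with the base pipe; for a simple case like this, the two should produce identical output:

```{r}
#| eval: false

# Identical result, using the base pipe instead of magrittr's %>%
mtcars |>
  group_by(cyl) |>
  summarize(n = n())
```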
+
:::

## Groups

So far you've learned about functions that work with rows and columns.
dplyr gets even more powerful when you add in the ability to work with groups.
In this section, we'll focus on the most important functions: `group_by()`, `summarize()`, and the slice family of functions.

### `group_by()`

Use `group_by()` to divide your dataset into groups meaningful for your analysis:

```{r}
flights |>
  group_by(month)
```

`group_by()` doesn't change the data but, if you look closely at the output, you'll notice that the output indicates that it is "grouped by" month (`Groups: month [12]`).
This means subsequent operations will now work "by month".
`group_by()` adds this grouped feature (referred to as class) to the data frame, which changes the behavior of the subsequent verbs applied to the data.

### `summarize()` {#sec-summarize}

The most important grouped operation is a summary, which, if being used to calculate a single summary statistic, reduces the data frame to have a single row for each group.
In dplyr, this operation is performed by `summarize()`[^data-transform-3], as shown by the following example, which computes the average departure delay by month:

[^data-transform-3]: Or `summarise()`, if you prefer British English.

```{r}
flights |>
  group_by(month) |>
  summarize(
    avg_delay = mean(dep_delay)
  )
```

Uhoh!
Something has gone wrong and all of our results are `NA`s (pronounced "N-A"), R's symbol for missing values.
This happened because some of the observed flights had missing data in the delay column, and so when we calculated the mean including those values, we got an `NA` result.
We'll come back to discuss missing values in detail in @sec-missing-values, but for now we'll tell the `mean()` function to ignore all missing values by setting the argument `na.rm` to `TRUE`:

```{r}
flights |>
  group_by(month) |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE)
  )
```

You can create any number of summaries in a single call to `summarize()`.
You'll learn various useful summaries in the upcoming chapters, but one very useful summary is `n()`, which returns the number of rows in each group:

```{r}
flights |>
  group_by(month) |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n()
  )
```

Means and counts can get you a surprisingly long way in data science!

### The `slice_` functions

There are five handy functions that allow you to extract specific rows within each group:

- `df |> slice_head(n = 1)` takes the first row from each group.
- `df |> slice_tail(n = 1)` takes the last row in each group.
- `df |> slice_min(x, n = 1)` takes the row with the smallest value of column `x`.
- `df |> slice_max(x, n = 1)` takes the row with the largest value of column `x`.
- `df |> slice_sample(n = 1)` takes one random row.

You can vary `n` to select more than one row, or instead of `n =`, you can use `prop = 0.1` to select (e.g.) 10% of the rows in each group.
For example, the following code finds the flights that are most delayed upon arrival at each destination:

```{r}
flights |>
  group_by(dest) |>
  slice_max(arr_delay, n = 1) |>
  relocate(dest)
```

Note that there are 105 destinations but we get 108 rows here.
What's up?
`slice_min()` and `slice_max()` keep tied values so `n = 1` means give us all rows with the highest value.
If you want exactly one row per group you can set `with_ties = FALSE`.
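
For example, a minimal sketch of that variant; it should return exactly one row per destination, keeping only the first of any tied rows:

```{r}
#| results: false

# Same slice as above, but ties are broken by keeping the first row
flights |>
  group_by(dest) |>
  slice_max(arr_delay, n = 1, with_ties = FALSE) |>
  relocate(dest)
```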
+

This is similar to computing the max delay with `summarize()`, but you get the whole corresponding row (or rows if there's a tie) instead of the single summary statistic.

### Grouping by multiple variables

You can create groups using more than one variable.
For example, we could make a group for each date.

```{r}
daily <- flights |>
  group_by(year, month, day)
daily
```

When you summarize a tibble grouped by more than one variable, each summary peels off the last group.
In hindsight, this wasn't a great way to make this function work, but it's difficult to change without breaking existing code.
To make it obvious what's happening, dplyr displays a message that tells you how you can change this behavior:

```{r}
daily_flights <- daily |>
  summarize(n = n())
```

If you're happy with this behavior, you can explicitly request it in order to suppress the message:

```{r}
#| results: false

daily_flights <- daily |>
  summarize(
    n = n(),
    .groups = "drop_last"
  )
```

Alternatively, change the default behavior by setting a different value, e.g., `"drop"` to drop all grouping or `"keep"` to preserve the same groups.

### Ungrouping

You might also want to remove grouping from a data frame without using `summarize()`.
You can do this with `ungroup()`.

```{r}
daily |>
  ungroup()
```

Now let's see what happens when you summarize an ungrouped data frame.

```{r}
daily |>
  ungroup() |>
  summarize(
    avg_delay = mean(dep_delay, na.rm = TRUE),
    flights = n()
  )
```

You get a single row back because dplyr treats all the rows in an ungrouped data frame as belonging to one group.

### `.by`

dplyr 1.1.0 includes a new, experimental syntax for per-operation grouping, the `.by` argument.
`group_by()` and `ungroup()` aren't going away, but you can now also use the `.by` argument to group within a single operation:

```{r}
#| results: false
flights |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n(),
    .by = month
  )
```

Or if you want to group by multiple variables:

```{r}
#| results: false
flights |>
  summarize(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n(),
    .by = c(origin, dest)
  )
```

`.by` works with all verbs and has the advantage that you don't need to use the `.groups` argument to suppress the grouping message or `ungroup()` when you're done.

We didn't focus on this syntax in this chapter because it was very new when we wrote the book.
We did want to mention it because we think it has a lot of promise and it's likely to be quite popular.
You can learn more about it in the [dplyr 1.1.0 blog post](https://www.tidyverse.org/blog/2023/02/dplyr-1-1-0-per-operation-grouping/).

### Exercises

1. Which carrier has the worst average delays?
   Challenge: can you disentangle the effects of bad airports vs. bad carriers?
   Why/why not?
   (Hint: think about `flights |> group_by(carrier, dest) |> summarize(n())`)

2. Find the flights that are most delayed upon departure from each destination.

3. How do delays vary over the course of the day?
   Illustrate your answer with a plot.

4. What happens if you supply a negative `n` to `slice_min()` and friends?

5. Explain what `count()` does in terms of the dplyr verbs you just learned.
   What does the `sort` argument to `count()` do?

6. Suppose we have the following tiny data frame:

   ```{r}
   df <- tibble(
     x = 1:5,
     y = c("a", "b", "a", "a", "b"),
     z = c("K", "K", "L", "L", "K")
   )
   ```

   a.
Write down what you think the output will look like, then check if you were correct, and describe what `group_by()` does.

      ```{r}
      #| eval: false

      df |>
        group_by(y)
      ```

   b. Write down what you think the output will look like, then check if you were correct, and describe what `arrange()` does.
      Also comment on how it's different from the `group_by()` in part (a).

      ```{r}
      #| eval: false

      df |>
        arrange(y)
      ```

   c. Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does.

      ```{r}
      #| eval: false

      df |>
        group_by(y) |>
        summarize(mean_x = mean(x))
      ```

   d. Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does.
      Then, comment on what the message says.

      ```{r}
      #| eval: false

      df |>
        group_by(y, z) |>
        summarize(mean_x = mean(x))
      ```

   e. Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does.
      How is the output different from the one in part (d)?

      ```{r}
      #| eval: false

      df |>
        group_by(y, z) |>
        summarize(mean_x = mean(x), .groups = "drop")
      ```

   f. Write down what you think the outputs will look like, then check if you were correct, and describe what each pipeline does.
      How are the outputs of the two pipelines different?

      ```{r}
      #| eval: false

      df |>
        group_by(y, z) |>
        summarize(mean_x = mean(x))

      df |>
        group_by(y, z) |>
        mutate(mean_x = mean(x))
      ```

## Case study: aggregates and sample size {#sec-sample-size}

Whenever you do any aggregation, it's always a good idea to include a count (`n()`).
That way, you can ensure that you're not drawing conclusions based on very small amounts of data.
We'll demonstrate this with some baseball data from the **Lahman** package.
Specifically, we will compare what proportion of times a player gets a hit (`H`) vs. the number of times they try to put the ball in play (`AB`):

```{r}
batters <- Lahman::Batting |>
  group_by(playerID) |>
  summarize(
    performance = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    n = sum(AB, na.rm = TRUE)
  )
batters
```

When we plot the skill of the batter (measured by the batting average, `performance`) against the number of opportunities to hit the ball (measured by times at bat, `n`), you see two patterns:

1. The variation in `performance` is larger among players with fewer at-bats.
   The shape of this plot is very characteristic: whenever you plot a mean (or other summary statistics) vs. group size, you'll see that the variation decreases as the sample size increases[^data-transform-4].

2. There's a positive correlation between skill (`performance`) and opportunities to hit the ball (`n`) because teams want to give their best batters the most opportunities to hit the ball.

[^data-transform-4]: \*cough\* the law of large numbers \*cough\*.

```{r}
#| warning: false
#| fig-alt: |
#|   A scatterplot of batting performance vs. batting opportunities
#|   overlaid with a smoothed line. Average performance increases sharply
#|   from 0.2 when n is 1 to 0.25 when n is ~1000. Average performance
#|   continues to increase linearly at a much shallower slope, reaching
#|   ~0.3 when n is ~15,000.

batters |>
  filter(n > 100) |>
  ggplot(aes(x = n, y = performance)) +
  geom_point(alpha = 1 / 10) +
  geom_smooth(se = FALSE)
```

Note the handy pattern for combining ggplot2 and dplyr.
+
You just have to remember to switch from `|>`, for dataset processing, to `+` for adding layers to your plot.

This also has important implications for ranking.
If you naively sort on `desc(performance)`, the people with the best batting averages are clearly the ones who tried to put the ball in play very few times and happened to get a hit; they're not necessarily the most skilled players:

```{r}
batters |>
  arrange(desc(performance))
```

You can find a good explanation of this problem and how to overcome it at <http://varianceexplained.org/r/empirical_bayes_baseball/> and <https://www.evanmiller.org/how-not-to-sort-by-average-rating.html>.

## Summary

In this chapter, you've learned the tools that dplyr provides for working with data frames.
The tools are roughly grouped into three categories: those that manipulate the rows (like `filter()` and `arrange()`), those that manipulate the columns (like `select()` and `mutate()`), and those that manipulate groups (like `group_by()` and `summarize()`).
In this chapter, we've focused on these "whole data frame" tools, but you haven't yet learned much about what you can do with individual variables.
We'll come back to that in the Transform part of the book, where each chapter will give you tools for a specific type of variable.

In the next chapter, we'll pivot back to workflow to discuss the importance of code style, keeping your code well organized in order to make it easy for you and others to read and understand your code.
diff --git a/data-visualize.qmd b/data-visualize.qmd new file mode 100644 index 000000000..967e63544 --- /dev/null +++ b/data-visualize.qmd @@ -0,0 +1,930 @@
+# Data visualization {#sec-data-visualization}

```{r}
#| echo: false

source("_common.R")
```

## Introduction

> "The simple graph has brought more information to the data analyst's mind than any other device." --- John Tukey

R has several systems for making graphs, but ggplot2 is one of the most elegant and most versatile.
ggplot2 implements the **grammar of graphics**, a coherent system for describing and building graphs.
With ggplot2, you can do more and faster by learning one system and applying it in many places.

This chapter will teach you how to visualize your data using **ggplot2**.
We will start by creating a simple scatterplot and use that to introduce aesthetic mappings and geometric objects -- the fundamental building blocks of ggplot2.
We will then walk you through visualizing distributions of single variables as well as visualizing relationships between two or more variables.
We'll finish off with saving your plots and troubleshooting tips.

### Prerequisites

This chapter focuses on ggplot2, one of the core packages in the tidyverse.
To access the datasets, help pages, and functions used in this chapter, load the tidyverse by running:

```{r}
#| label: setup

library(tidyverse)
```

That one line of code loads the core tidyverse; the packages that you will use in almost every data analysis.
It also tells you which functions from the tidyverse conflict with functions in base R (or from other packages you might have loaded)[^data-visualize-1].

[^data-visualize-1]: You can eliminate that message and force conflict resolution to happen on demand by using the conflicted package, which becomes more important as you load more packages.
    You can learn more about conflicted at <https://conflicted.r-lib.org>.

If you run this code and get the error message `there is no package called 'tidyverse'`, you'll need to first install it, then run `library()` once again.
+

```{r}
#| eval: false

install.packages("tidyverse")
library(tidyverse)
```

You only need to install a package once, but you need to load it every time you start a new session.

In addition to tidyverse, we will also use the **palmerpenguins** package, which includes the `penguins` dataset containing body measurements for penguins on three islands in the Palmer Archipelago, and the ggthemes package, which offers a colorblind safe color palette.

```{r}
library(palmerpenguins)
library(ggthemes)
```

## First steps

Do penguins with longer flippers weigh more or less than penguins with shorter flippers?
You probably already have an answer, but try to make your answer precise.
What does the relationship between flipper length and body mass look like?
Is it positive?
Negative?
Linear?
Nonlinear?
Does the relationship vary by the species of the penguin?
How about by the island where the penguin lives?
Let's create visualizations that we can use to answer these questions.

### The `penguins` data frame

You can test your answers to those questions with the `penguins` **data frame** found in palmerpenguins (a.k.a. `palmerpenguins::penguins`).
A data frame is a rectangular collection of variables (in the columns) and observations (in the rows).
`penguins` contains `r nrow(penguins)` observations collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER[^data-visualize-2].

[^data-visualize-2]: Horst AM, Hill AP, Gorman KB (2020).
    palmerpenguins: Palmer Archipelago (Antarctica) penguin data.
    R package version 0.1.0.
    <https://allisonhorst.github.io/palmerpenguins/>.
    doi: 10.5281/zenodo.3960218.

To make the discussion easier, let's define some terms:

- A **variable** is a quantity, quality, or property that you can measure.

- A **value** is the state of a variable when you measure it.
  The value of a variable may change from measurement to measurement.

- An **observation** is a set of measurements made under similar conditions (you usually make all of the measurements in an observation at the same time and on the same object).
  An observation will contain several values, each associated with a different variable.
  We'll sometimes refer to an observation as a data point.

- **Tabular data** is a set of values, each associated with a variable and an observation.
  Tabular data is *tidy* if each value is placed in its own "cell", each variable in its own column, and each observation in its own row.

In this context, a variable refers to an attribute of all the penguins, and an observation refers to all the attributes of a single penguin.

Type the name of the data frame in the console and R will print a preview of its contents.
Note that it says `tibble` on top of this preview.
In the tidyverse, we use special data frames called **tibbles** that you will learn more about soon.

```{r}
penguins
```

This data frame contains `r ncol(penguins)` columns.
For an alternative view, where you can see all variables and the first few observations of each variable, use `glimpse()`.
Or, if you're in RStudio, run `View(penguins)` to open an interactive data viewer.

```{r}
glimpse(penguins)
```

Among the variables in `penguins` are:

1. `species`: a penguin's species (Adelie, Chinstrap, or Gentoo).

2. `flipper_length_mm`: length of a penguin's flipper, in millimeters.

3. `body_mass_g`: body mass of a penguin, in grams.

To learn more about `penguins`, open its help page by running `?penguins`.
+ +### Ultimate goal {#sec-ultimate-goal} + +Our ultimate goal in this chapter is to recreate the following visualization displaying the relationship between flipper lengths and body masses of these penguins, taking into consideration the species of the penguin. + +```{r} +#| echo: false +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins, with a +#| best fit line of the relationship between these two variables +#| overlaid. The plot displays a positive, fairly linear, and relatively +#| strong relationship between these two variables. Species (Adelie, +#| Chinstrap, and Gentoo) are represented with different colors and +#| shapes. The relationship between body mass and flipper length is +#| roughly the same for these three species, and Gentoo penguins are +#| larger than penguins from the other two species. + +ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) + + geom_point(aes(color = species, shape = species)) + + geom_smooth(method = "lm") + + labs( + title = "Body mass and flipper length", + subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", + x = "Flipper length (mm)", + y = "Body mass (g)", + color = "Species", + shape = "Species" + ) + + scale_color_colorblind() +``` + +### Creating a ggplot + +Let's recreate this plot step-by-step. + +With ggplot2, you begin a plot with the function `ggplot()`, defining a plot object that you then add **layers** to. +The first argument of `ggplot()` is the dataset to use in the graph and so `ggplot(data = penguins)` creates an empty graph that is primed to display the `penguins` data, but since we haven't told it how to visualize it yet, for now it's empty. +This is not a very exciting plot, but you can think of it like an empty canvas you'll paint the remaining layers of your plot onto. + +```{r} +#| fig-alt: | +#| A blank, gray plot area. + +ggplot(data = penguins) +``` + +Next, we need to tell `ggplot()` how the information from our data will be visually represented. +The `mapping` argument of the `ggplot()` function defines how variables in your dataset are mapped to visual properties (**aesthetics**) of your plot. +The `mapping` argument is always defined in the `aes()` function, and the `x` and `y` arguments of `aes()` specify which variables to map to the x and y axes. +For now, we will only map flipper length to the `x` aesthetic and body mass to the `y` aesthetic. +ggplot2 looks for the mapped variables in the `data` argument, in this case, `penguins`. + +The following plot shows the result of adding these mappings. + +```{r} +#| fig-alt: | +#| The plot shows flipper length on the x-axis, with values that range from +#| 170 to 230, and body mass on the y-axis, with values that range from 3000 +#| to 6000. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g) +) +``` + +Our empty canvas now has more structure -- it's clear where flipper lengths will be displayed (on the x-axis) and where body masses will be displayed (on the y-axis). +But the penguins themselves are not yet on the plot. +This is because we have not yet articulated, in our code, how to represent the observations from our data frame on our plot. + +To do so, we need to define a **geom**: the geometrical object that a plot uses to represent data. +These geometric objects are made available in ggplot2 with functions that start with `geom_`. +People often describe plots by the type of geom that the plot uses. 
+For example, bar charts use bar geoms (`geom_bar()`), line charts use line geoms (`geom_line()`), boxplots use boxplot geoms (`geom_boxplot()`), scatterplots use point geoms (`geom_point()`), and so on. + +The function `geom_point()` adds a layer of points to your plot, which creates a scatterplot. +ggplot2 comes with many geom functions that each adds a different type of layer to a plot. +You'll learn a whole bunch of geoms throughout the book, particularly in @sec-layers. + +```{r} +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins. The plot +#| displays a positive, linear, and relatively strong relationship between +#| these two variables. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g) +) + + geom_point() +``` + +Now we have something that looks like what we might think of as a "scatterplot". +It doesn't yet match our "ultimate goal" plot, but using this plot we can start answering the question that motivated our exploration: "What does the relationship between flipper length and body mass look like?" The relationship appears to be positive (as flipper length increases, so does body mass), fairly linear (the points are clustered around a line instead of a curve), and moderately strong (there isn't too much scatter around such a line). +Penguins with longer flippers are generally larger in terms of their body mass. + +Before we add more layers to this plot, let's pause for a moment and review the warning message we got: + +> Removed 2 rows containing missing values (`geom_point()`). + +We're seeing this message because there are two penguins in our dataset with missing body mass and/or flipper length values and ggplot2 has no way of representing them on the plot without both of these values. +Like R, ggplot2 subscribes to the philosophy that missing values should never silently go missing. +This type of warning is probably one of the most common types of warnings you will see when working with real data -- missing values are a very common issue and you'll learn more about them throughout the book, particularly in @sec-missing-values. +For the remaining plots in this chapter we will suppress this warning so it's not printed alongside every single plot we make. + +### Adding aesthetics and layers {#sec-adding-aesthetics-layers} + +Scatterplots are useful for displaying the relationship between two numerical variables, but it's always a good idea to be skeptical of any apparent relationship between two variables and ask if there may be other variables that explain or change the nature of this apparent relationship. +For example, does the relationship between flipper length and body mass differ by species? +Let's incorporate species into our plot and see if this reveals any additional insights into the apparent relationship between these variables. +We will do this by representing species with different colored points. + +To achieve this, will we need to modify the aesthetic or the geom? +If you guessed "in the aesthetic mapping, inside of `aes()`", you're already getting the hang of creating data visualizations with ggplot2! +And if not, don't worry. +Throughout the book you will make many more ggplots and have many more opportunities to check your intuition as you make them. + +```{r} +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins. The plot +#| displays a positive, fairly linear, and relatively strong relationship +#| between these two variables. 
Species (Adelie, Chinstrap, and Gentoo) +#| are represented with different colors. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species) +) + + geom_point() +``` + +When a categorical variable is mapped to an aesthetic, ggplot2 will automatically assign a unique value of the aesthetic (here a unique color) to each unique level of the variable (each of the three species), a process known as **scaling**. +ggplot2 will also add a legend that explains which values correspond to which levels. + +Now let's add one more layer: a smooth curve displaying the relationship between body mass and flipper length. +Before you proceed, refer back to the code above, and think about how we can add this to our existing plot. + +Since this is a new geometric object representing our data, we will add a new geom as a layer on top of our point geom: `geom_smooth()`. +And we will specify that we want to draw the line of best fit based on a `l`inear `m`odel with `method = "lm"`. + +```{r} +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins. Overlaid +#| on the scatterplot are three smooth curves displaying the +#| relationship between these variables for each species (Adelie, +#| Chinstrap, and Gentoo). Different penguin species are plotted in +#| different colors for the points and the smooth curves. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g, color = species) +) + + geom_point() + + geom_smooth(method = "lm") +``` + +We have successfully added lines, but this plot doesn't look like the plot from @sec-ultimate-goal, which only has one line for the entire dataset as opposed to separate lines for each of the penguin species. + +When aesthetic mappings are defined in `ggplot()`, at the *global* level, they're passed down to each of the subsequent geom layers of the plot. +However, each geom function in ggplot2 can also take a `mapping` argument, which allows for aesthetic mappings at the *local* level that are added to those inherited from the global level. +Since we want points to be colored based on species but don't want the lines to be separated out for them, we should specify `color = species` for `geom_point()` only. + +```{r} +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins. Overlaid +#| on the scatterplot is a single line of best fit displaying the +#| relationship between these variables for each species (Adelie, +#| Chinstrap, and Gentoo). Different penguin species are plotted in +#| different colors for the points only. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g) +) + + geom_point(mapping = aes(color = species)) + + geom_smooth(method = "lm") +``` + +Voila! +We have something that looks very much like our ultimate goal, though it's not yet perfect. +We still need to use different shapes for each species of penguins and improve labels. + +It's generally not a good idea to represent information using only colors on a plot, as people perceive colors differently due to color blindness or other color vision differences. +Therefore, in addition to color, we can also map `species` to the `shape` aesthetic. + +```{r} +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins. Overlaid +#| on the scatterplot is a single line of best fit displaying the +#| relationship between these variables for each species (Adelie, +#| Chinstrap, and Gentoo). 
Different penguin species are plotted in +#| different colors and shapes for the points only. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g) +) + + geom_point(mapping = aes(color = species, shape = species)) + + geom_smooth(method = "lm") +``` + +Note that the legend is automatically updated to reflect the different shapes of the points as well. + +And finally, we can improve the labels of our plot using the `labs()` function in a new layer. +Some of the arguments to `labs()` might be self explanatory: `title` adds a title and `subtitle` adds a subtitle to the plot. +Other arguments match the aesthetic mappings, `x` is the x-axis label, `y` is the y-axis label, and `color` and `shape` define the label for the legend. +In addition, we can improve the color palette to be colorblind safe with the `scale_color_colorblind()` function from the ggthemes package. + +```{r} +#| warning: false +#| fig-alt: | +#| A scatterplot of body mass vs. flipper length of penguins, with a +#| line of best fit displaying the relationship between these two variables +#| overlaid. The plot displays a positive, fairly linear, and relatively +#| strong relationship between these two variables. Species (Adelie, +#| Chinstrap, and Gentoo) are represented with different colors and +#| shapes. The relationship between body mass and flipper length is +#| roughly the same for these three species, and Gentoo penguins are +#| larger than penguins from the other two species. + +ggplot( + data = penguins, + mapping = aes(x = flipper_length_mm, y = body_mass_g) +) + + geom_point(aes(color = species, shape = species)) + + geom_smooth(method = "lm") + + labs( + title = "Body mass and flipper length", + subtitle = "Dimensions for Adelie, Chinstrap, and Gentoo Penguins", + x = "Flipper length (mm)", y = "Body mass (g)", + color = "Species", shape = "Species" + ) + + scale_color_colorblind() +``` + +We finally have a plot that perfectly matches our "ultimate goal"! + +### Exercises + +1. How many rows are in `penguins`? + How many columns? + +2. What does the `bill_depth_mm` variable in the `penguins` data frame describe? + Read the help for `?penguins` to find out. + +3. Make a scatterplot of `bill_depth_mm` vs. `bill_length_mm`. + That is, make a scatterplot with `bill_depth_mm` on the y-axis and `bill_length_mm` on the x-axis. + Describe the relationship between these two variables. + +4. What happens if you make a scatterplot of `species` vs. `bill_depth_mm`? + What might be a better choice of geom? + +5. Why does the following give an error and how would you fix it? + + ```{r} + #| eval: false + + ggplot(data = penguins) + + geom_point() + ``` + +6. What does the `na.rm` argument do in `geom_point()`? + What is the default value of the argument? + Create a scatterplot where you successfully use this argument set to `TRUE`. + +7. Add the following caption to the plot you made in the previous exercise: "Data come from the palmerpenguins package." Hint: Take a look at the documentation for `labs()`. + +8. Recreate the following visualization. + What aesthetic should `bill_depth_mm` be mapped to? + And should it be mapped at the global level or at the geom level? + + ```{r} + #| echo: false + #| warning: false + #| fig-alt: | + #| A scatterplot of body mass vs. flipper length of penguins, colored + #| by bill depth. A smooth curve of the relationship between body mass + #| and flipper length is overlaid. The relationship is positive, + #| fairly linear, and moderately strong. 
+

    ggplot(
      data = penguins,
      mapping = aes(x = flipper_length_mm, y = body_mass_g)
    ) +
      geom_point(aes(color = bill_depth_mm)) +
      geom_smooth()
    ```

9. Run this code in your head and predict what the output will look like.
   Then, run the code in R and check your predictions.

   ```{r}
   #| eval: false

   ggplot(
     data = penguins,
     mapping = aes(x = flipper_length_mm, y = body_mass_g, color = island)
   ) +
     geom_point() +
     geom_smooth(se = FALSE)
   ```

10. Will these two graphs look different?
    Why/why not?

    ```{r}
    #| eval: false

    ggplot(
      data = penguins,
      mapping = aes(x = flipper_length_mm, y = body_mass_g)
    ) +
      geom_point() +
      geom_smooth()

    ggplot() +
      geom_point(
        data = penguins,
        mapping = aes(x = flipper_length_mm, y = body_mass_g)
      ) +
      geom_smooth(
        data = penguins,
        mapping = aes(x = flipper_length_mm, y = body_mass_g)
      )
    ```

## ggplot2 calls {#sec-ggplot2-calls}

As we move on from these introductory sections, we'll transition to a more concise expression of ggplot2 code.
So far we've been very explicit, which is helpful when you are learning:

```{r}
#| eval: false

ggplot(
  data = penguins,
  mapping = aes(x = flipper_length_mm, y = body_mass_g)
) +
  geom_point()
```

Typically, the first one or two arguments to a function are so important that you should know them by heart.
The first two arguments to `ggplot()` are `data` and `mapping`; in the remainder of the book, we won't supply those names.
That saves typing, and, by reducing the amount of extra text, makes it easier to see what's different between plots.
That's a really important programming concern that we'll come back to in @sec-functions.

Rewriting the previous plot more concisely yields:

```{r}
#| eval: false

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
```

In the future, you'll also learn about the pipe, `|>`, which will allow you to create that plot with:

```{r}
#| eval: false

penguins |>
  ggplot(aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
```

## Visualizing distributions

How you visualize the distribution of a variable depends on the type of variable: categorical or numerical.

### A categorical variable

A variable is **categorical** if it can only take one of a small set of values.
To examine the distribution of a categorical variable, you can use a bar chart.
The height of the bars displays how many observations occurred with each `x` value.

```{r}
#| fig-alt: |
#|   A bar chart of frequencies of species of penguins: Adelie
#|   (approximately 150), Chinstrap (approximately 90), Gentoo
#|   (approximately 125).

ggplot(penguins, aes(x = species)) +
  geom_bar()
```

In bar plots of categorical variables with non-ordered levels, like the penguin `species` above, it's often preferable to reorder the bars based on their frequencies.
Doing so requires transforming the variable to a factor (how R handles categorical data) and then reordering the levels of that factor.

```{r}
#| fig-alt: |
#|   A bar chart of frequencies of species of penguins, where the bars are
#|   ordered in decreasing order of their heights (frequencies): Adelie
#|   (approximately 150), Gentoo (approximately 125), Chinstrap
#|   (approximately 90).

ggplot(penguins, aes(x = fct_infreq(species))) +
  geom_bar()
```

You will learn more about factors and functions for dealing with factors (like `fct_infreq()` shown above) in @sec-factors.
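
If you'd rather show the bars in increasing order instead, one option is to reverse the frequency-ordered levels with `fct_rev()`, another forcats function that is loaded with the core tidyverse; a quick sketch:

```{r}
#| eval: false

# Reverse the frequency ordering so bars increase from left to right
ggplot(penguins, aes(x = fct_rev(fct_infreq(species)))) +
  geom_bar()
```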
+

### A numerical variable

A variable is **numerical** (or quantitative) if it can take on a wide range of numerical values, and it is sensible to add, subtract, or take averages with those values.
Numerical variables can be continuous or discrete.

One commonly used visualization for distributions of continuous variables is a histogram.

```{r}
#| warning: false
#| fig-alt: |
#|   A histogram of body masses of penguins. The distribution is unimodal
#|   and right skewed, ranging between approximately 2500 and 6500 grams.

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 200)
```

A histogram divides the x-axis into equally spaced bins and then uses the height of a bar to display the number of observations that fall in each bin.
In the graph above, the tallest bar shows that 39 observations have a `body_mass_g` value between 3,500 and 3,700 grams, which are the left and right edges of the bar.

You can set the width of the intervals in a histogram with the `binwidth` argument, which is measured in the units of the `x` variable.
You should always explore a variety of binwidths when working with histograms, as different binwidths can reveal different patterns.
In the plots below a binwidth of 20 is too narrow, resulting in too many bars, making it difficult to determine the shape of the distribution.
Similarly, a binwidth of 2,000 is too wide, resulting in all data being binned into only three bars, and also making it difficult to determine the shape of the distribution.
A binwidth of 200 provides a sensible balance.

```{r}
#| warning: false
#| layout-ncol: 2
#| fig-width: 3
#| fig-alt: |
#|   Two histograms of body masses of penguins, one with binwidth of 20
#|   (left) and one with binwidth of 2000 (right). The histogram with binwidth
#|   of 20 shows lots of ups and downs in the heights of the bins, creating a
#|   jagged outline. The histogram with binwidth of 2000 shows only three bins.

ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 20)
ggplot(penguins, aes(x = body_mass_g)) +
  geom_histogram(binwidth = 2000)
```

An alternative visualization for distributions of numerical variables is a density plot.
A density plot is a smoothed-out version of a histogram and a practical alternative, particularly for continuous data that comes from an underlying smooth distribution.
We won't go into how `geom_density()` estimates the density (you can read more about that in the function documentation), but let's explain how the density curve is drawn with an analogy.
Imagine a histogram made out of wooden blocks.
Then, imagine that you drop a cooked spaghetti string over it.
The shape the spaghetti will take draped over the blocks can be thought of as the shape of the density curve.
It shows fewer details than a histogram but can make it easier to quickly glean the shape of the distribution, particularly with respect to modes and skewness.

```{r}
#| fig-alt: |
#|   A density plot of body masses of penguins. The distribution is unimodal
#|   and right skewed, ranging between approximately 2500 and 6500 grams.

ggplot(penguins, aes(x = body_mass_g)) +
  geom_density()
```

### Exercises

1. Make a bar plot of `species` of `penguins`, where you assign `species` to the `y` aesthetic.
   How is this plot different?

2. How are the following two plots different?
   Which aesthetic, `color` or `fill`, is more useful for changing the color of bars?
+ + ```{r} + #| eval: false + + ggplot(penguins, aes(x = species)) + + geom_bar(color = "red") + + ggplot(penguins, aes(x = species)) + + geom_bar(fill = "red") + ``` + +3. What does the `bins` argument in `geom_histogram()` do? + +4. Make a histogram of the `carat` variable in the `diamonds` dataset that is available when you load the tidyverse package. + Experiment with different binwidths. + What binwidth reveals the most interesting patterns? + +## Visualizing relationships + +To visualize a relationship we need to have at least two variables mapped to aesthetics of a plot. +In the following sections you will learn about commonly used plots for visualizing relationships between two or more variables and the geoms used for creating them. + +### A numerical and a categorical variable + +To visualize the relationship between a numerical and a categorical variable we can use side-by-side box plots. +A **boxplot** is a type of visual shorthand for measures of position (percentiles) that describe a distribution. +It is also useful for identifying potential outliers. +As shown in @fig-eda-boxplot, each boxplot consists of: + +- A box that indicates the range of the middle half of the data, a distance known as the interquartile range (IQR), stretching from the 25th percentile of the distribution to the 75th percentile. + In the middle of the box is a line that displays the median, i.e. 50th percentile, of the distribution. + These three lines give you a sense of the spread of the distribution and whether or not the distribution is symmetric about the median or skewed to one side. + +- Visual points that display observations that fall more than 1.5 times the IQR from either edge of the box. + These outlying points are unusual so are plotted individually. + +- A line (or whisker) that extends from each end of the box and goes to the farthest non-outlier point in the distribution. + +```{r} +#| label: fig-eda-boxplot +#| echo: false +#| fig-cap: | +#| Diagram depicting how a boxplot is created. +#| fig-alt: | +#| A diagram depicting how a boxplot is created following the steps outlined +#| above. + +knitr::include_graphics("images/EDA-boxplot.png") +``` + +Let's take a look at the distribution of body mass by species using `geom_boxplot()`: + +```{r} +#| warning: false +#| fig-alt: | +#| Side-by-side box plots of distributions of body masses of Adelie, +#| Chinstrap, and Gentoo penguins. The distribution of Adelie and +#| Chinstrap penguins' body masses appear to be symmetric with +#| medians around 3750 grams. The median body mass of Gentoo penguins +#| is much higher, around 5000 grams, and the distribution of the +#| body masses of these penguins appears to be somewhat right skewed. + +ggplot(penguins, aes(x = species, y = body_mass_g)) + + geom_boxplot() +``` + +Alternatively, we can make density plots with `geom_density()`. + +```{r} +#| warning: false +#| fig-alt: | +#| A density plot of body masses of penguins by species of penguins. Each +#| species (Adelie, Chinstrap, and Gentoo) is represented with different +#| colored outlines for the density curves. + +ggplot(penguins, aes(x = body_mass_g, color = species)) + + geom_density(linewidth = 0.75) +``` + +We've also customized the thickness of the lines using the `linewidth` argument in order to make them stand out a bit more against the background. + +Additionally, we can map `species` to both `color` and `fill` aesthetics and use the `alpha` aesthetic to add transparency to the filled density curves. 
+This aesthetic takes values between 0 (completely transparent) and 1 (completely opaque).
In the following plot it's *set* to 0.5.

```{r}
#| warning: false
#| fig-alt: |
#|   A density plot of body masses of penguins by species of penguins. Each
#|   species (Adelie, Chinstrap, and Gentoo) is represented in different
#|   colored outlines for the density curves. The density curves are also
#|   filled with the same colors, with some transparency added.

ggplot(penguins, aes(x = body_mass_g, color = species, fill = species)) +
  geom_density(alpha = 0.5)
```

Note the terminology we have used here:

- We *map* variables to aesthetics if we want the visual attribute represented by that aesthetic to vary based on the values of that variable.
- Otherwise, we *set* the value of an aesthetic.

### Two categorical variables

We can use stacked bar plots to visualize the relationship between two categorical variables.
For example, the following two stacked bar plots both display the relationship between `island` and `species`, or specifically, visualizing the distribution of `species` within each island.

The first plot shows the frequencies of each species of penguins on each island.
The plot of frequencies shows that there are equal numbers of Adelies on each island.
But we don't have a good sense of the percentage balance within each island.

```{r}
#| fig-alt: |
#|   Bar plots of penguin species by island (Biscoe, Dream, and Torgersen)
ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar()
```

The second plot, a relative frequency plot created by setting `position = "fill"` in the geom, is more useful for comparing species distributions across islands since it's not affected by the unequal numbers of penguins across the islands.
Using this plot we can see that Gentoo penguins all live on Biscoe island and make up roughly 75% of the penguins on that island, Chinstrap all live on Dream island and make up roughly 50% of the penguins on that island, and Adelie live on all three islands and make up all of the penguins on Torgersen.

```{r}
#| fig-alt: |
#|   Bar plots of penguin species by island (Biscoe, Dream, and Torgersen),
#|   where the bars are scaled to the same height, making it a relative
#|   frequencies plot.

ggplot(penguins, aes(x = island, fill = species)) +
  geom_bar(position = "fill")
```

In creating these bar charts, we map the variable that will be separated into bars to the `x` aesthetic, and the variable that will change the colors inside the bars to the `fill` aesthetic.

### Two numerical variables

So far you've learned about scatterplots (created with `geom_point()`) and smooth curves (created with `geom_smooth()`) for visualizing the relationship between two numerical variables.
A scatterplot is probably the most commonly used plot for visualizing the relationship between two numerical variables.

```{r}
#| warning: false
#| fig-alt: |
#|   A scatterplot of body mass vs. flipper length of penguins. The plot
#|   displays a positive, linear, relatively strong relationship between
#|   these two variables.

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point()
```

### Three or more variables

As we saw in @sec-adding-aesthetics-layers, we can incorporate more variables into a plot by mapping them to additional aesthetics.
For example, in the following scatterplot the colors of points represent species and the shapes of points represent islands.
+

```{r}
#| warning: false
#| fig-alt: |
#|   A scatterplot of body mass vs. flipper length of penguins. The plot
#|   displays a positive, linear, relatively strong relationship between
#|   these two variables. The points are colored based on the species of the
#|   penguins and the shapes of the points represent islands (round points are
#|   Biscoe island, triangles are Dream island, and squares are Torgersen
#|   island). The plot is very busy and it's difficult to distinguish the
#|   shapes of the points.

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = island))
```

However, adding too many aesthetic mappings to a plot makes it cluttered and difficult to make sense of.
Another way, which is particularly useful for categorical variables, is to split your plot into **facets**, subplots that each display one subset of the data.

To facet your plot by a single variable, use `facet_wrap()`.
The first argument of `facet_wrap()` is a formula[^data-visualize-3], which you create with `~` followed by a variable name.
The variable that you pass to `facet_wrap()` should be categorical.

[^data-visualize-3]: Here "formula" is the name of the thing created by `~`, not a synonym for "equation".

```{r}
#| warning: false
#| fig-width: 8
#| fig-asp: 0.33
#| fig-alt: |
#|   A scatterplot of body mass vs. flipper length of penguins. The shapes and
#|   colors of points represent species. Penguins from each island are on a
#|   separate facet. Within each facet, the relationship between body mass and
#|   flipper length is positive, linear, relatively strong.

ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point(aes(color = species, shape = species)) +
  facet_wrap(~island)
```

You will learn about many other geoms for visualizing distributions of variables and relationships between them in @sec-layers.

### Exercises

1. The `mpg` data frame that is bundled with the ggplot2 package contains `r nrow(mpg)` observations collected by the US Environmental Protection Agency on `r mpg |> distinct(model) |> nrow()` car models.
   Which variables in `mpg` are categorical?
   Which variables are numerical?
   (Hint: Type `?mpg` to read the documentation for the dataset.) How can you see this information when you run `mpg`?

2. Make a scatterplot of `hwy` vs. `displ` using the `mpg` data frame.
   Next, map a third, numerical variable to `color`, then `size`, then both `color` and `size`, then `shape`.
   How do these aesthetics behave differently for categorical vs. numerical variables?

3. In the scatterplot of `hwy` vs. `displ`, what happens if you map a third variable to `linewidth`?

4. What happens if you map the same variable to multiple aesthetics?

5. Make a scatterplot of `bill_depth_mm` vs. `bill_length_mm` and color the points by `species`.
   What does adding coloring by species reveal about the relationship between these two variables?
   What about faceting by `species`?

6. Why does the following yield two separate legends?
   How would you fix it to combine the two legends?

   ```{r}
   #| warning: false
   #| fig-show: hide

   ggplot(
     data = penguins,
     mapping = aes(
       x = bill_length_mm, y = bill_depth_mm,
       color = species, shape = species
     )
   ) +
     geom_point() +
     labs(color = "Species")
   ```

7. Create the two following stacked bar plots.
   Which question can you answer with the first one?
   Which question can you answer with the second one?
+
+    ```{r}
+    #| fig-show: hide
+
+    ggplot(penguins, aes(x = island, fill = species)) +
+      geom_bar(position = "fill")
+    ggplot(penguins, aes(x = species, fill = island)) +
+      geom_bar(position = "fill")
+    ```
+
+## Saving your plots {#sec-ggsave}
+
+Once you've made a plot, you might want to get it out of R by saving it as an image that you can use elsewhere.
+That's the job of `ggsave()`, which will save the plot most recently created to disk:
+
+```{r}
+#| fig-show: hide
+#| warning: false
+
+ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
+  geom_point()
+ggsave(filename = "penguin-plot.png")
+```
+
+```{r}
+#| include: false
+
+file.remove("penguin-plot.png")
+```
+
+This will save your plot to your working directory, a concept you'll learn more about in @sec-workflow-scripts-projects.
+
+If you don't specify the `width` and `height`, they will be taken from the dimensions of the current plotting device.
+For reproducible code, you'll want to specify them.
+You can learn more about `ggsave()` in the documentation.
+
+Generally, however, we recommend that you assemble your final reports using Quarto, a reproducible authoring system that allows you to interleave your code and your prose and automatically include your plots in your write-ups.
+You will learn more about Quarto in @sec-quarto.
+
+### Exercises
+
+1. Run the following lines of code.
+    Which of the two plots is saved as `mpg-plot.png`?
+    Why?
+
+    ```{r}
+    #| eval: false
+
+    ggplot(mpg, aes(x = class)) +
+      geom_bar()
+    ggplot(mpg, aes(x = cty, y = hwy)) +
+      geom_point()
+    ggsave("mpg-plot.png")
+    ```
+
+2. What do you need to change in the code above to save the plot as a PDF instead of a PNG?
+    How could you find out what types of image files would work in `ggsave()`?
+
+## Common problems
+
+As you start to run R code, you're likely to run into problems.
+Don't worry --- it happens to everyone.
+We have all been writing R code for years, but every day we still write code that doesn't work on the first try!
+
+Start by carefully comparing the code that you're running to the code in the book.
+R is extremely picky, and a misplaced character can make all the difference.
+Make sure that every `(` is matched with a `)` and every `"` is paired with another `"`.
+Sometimes you'll run the code and nothing happens.
+Check the left-hand side of your console: if it's a `+`, it means that R doesn't think you've typed a complete expression and it's waiting for you to finish it.
+In this case, it's usually easy to start from scratch again by pressing ESCAPE to abort processing the current command.
+
+One common problem when creating ggplot2 graphics is to put the `+` in the wrong place: it has to come at the end of the line, not the start.
+In other words, make sure you haven't accidentally written code like this:
+
+```{r}
+#| eval: false
+
+ggplot(data = mpg)
++ geom_point(mapping = aes(x = displ, y = hwy))
+```
+
+If you're still stuck, try the help.
+You can get help about any R function by running `?function_name` in the console, or highlighting the function name and pressing F1 in RStudio.
+Don't worry if the help doesn't seem that helpful --- instead skip down to the examples and look for code that matches what you're trying to do.
+
+If that doesn't help, carefully read the error message.
+Sometimes the answer will be buried there!
+But when you're new to R, even if the answer is in the error message, you might not yet know how to understand it.
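+
+For example, here's a sketch of the kind of error a simple typo produces (a hypothetical session; the exact wording of the message depends on your version of R):
+
+```{r}
+#| eval: false
+
+# A typo in the geom name means R can't find the function,
+# and the error message names the missing function directly.
+ggplot(mpg, aes(x = displ, y = hwy)) +
+  geom_piont()
+#> Error in geom_piont() : could not find function "geom_piont"
+```
+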
+Another great tool is Google: try googling the error message, as it's likely someone else has had the same problem and has gotten help online.
+
+## Summary
+
+In this chapter, you've learned the basics of data visualization with ggplot2.
+We started with the basic idea that underpins ggplot2: a visualization is a mapping from variables in your data to aesthetic properties like position, color, size, and shape.
+You then learned about increasing the complexity and improving the presentation of your plots layer-by-layer.
+You also learned about commonly used plots for visualizing the distribution of a single variable as well as for visualizing relationships between two or more variables, by leveraging additional aesthetic mappings and/or splitting your plot into small multiples using faceting.
+
+We'll use visualizations again and again throughout this book, introducing new techniques as we need them, and we'll do a deeper dive into creating visualizations with ggplot2 in @sec-layers through @sec-communication.
+
+With the basics of visualization under your belt, in the next chapter we're going to switch gears a little and give you some practical workflow advice.
+We intersperse workflow advice with data science tools throughout this part of the book because it'll help you stay organized as you write increasing amounts of R code.
diff --git a/data/01-sales.csv b/data/01-sales.csv
new file mode 100644
index 000000000..534e8ac10
--- /dev/null
+++ b/data/01-sales.csv
@@ -0,0 +1,8 @@
+month,year,brand,item,n
+January,2019,1,1234,3
+January,2019,1,8721,9
+January,2019,1,1822,2
+January,2019,2,3333,1
+January,2019,2,2156,9
+January,2019,2,3987,6
+January,2019,2,3827,6
\ No newline at end of file
diff --git a/data/02-sales.csv b/data/02-sales.csv
new file mode 100644
index 000000000..6c8d2aa52
--- /dev/null
+++ b/data/02-sales.csv
@@ -0,0 +1,7 @@
+month,year,brand,item,n
+February,2019,1,1234,8
+February,2019,1,8721,2
+February,2019,1,1822,3
+February,2019,2,3333,1
+February,2019,2,2156,3
+February,2019,2,3987,6
diff --git a/data/03-sales.csv b/data/03-sales.csv
new file mode 100644
index 000000000..61d4d34a0
--- /dev/null
+++ b/data/03-sales.csv
@@ -0,0 +1,7 @@
+month,year,brand,item,n
+March,2019,1,1234,3
+March,2019,1,3627,1
+March,2019,1,8820,3
+March,2019,2,7253,1
+March,2019,2,8766,3
+March,2019,2,8288,6
diff --git a/data/bake-sale.xlsx b/data/bake-sale.xlsx
new file mode 100644
index 000000000..788373cc0
Binary files /dev/null and b/data/bake-sale.xlsx differ
diff --git a/data/gapminder.R b/data/gapminder.R
new file mode 100644
index 000000000..c2c297911
--- /dev/null
+++ b/data/gapminder.R
@@ -0,0 +1,24 @@
+library(tidyverse)
+
+# How many rows are there per year?
+repurrrsive::gap_simple |>
+  count(year)
+
+by_year <- repurrrsive::gap_simple |>
+  group_by(year)
+# One output path per year, e.g. data/gapminder/1952.xlsx
+paths <- by_year |>
+  group_keys() |>
+  mutate(path = str_glue("data/gapminder/{year}.xlsx")) |>
+  pull()
+paths
+
+# Split into one data frame per year, dropping the now-redundant year column
+years <- by_year |>
+  group_split() |>
+  map(\(df) select(df, -year))
+
+dir.create("data/gapminder")
+
+# Write each year's data frame to its matching .xlsx path
+walk2(years, paths, writexl::write_xlsx)
diff --git a/data/gapminder/1952.xlsx b/data/gapminder/1952.xlsx
new file mode 100644
index 000000000..7ce82a5b3
Binary files /dev/null and b/data/gapminder/1952.xlsx differ
diff --git a/data/gapminder/1957.xlsx b/data/gapminder/1957.xlsx
new file mode 100644
index 000000000..c909acdf2
Binary files /dev/null and b/data/gapminder/1957.xlsx differ
diff --git a/data/gapminder/1962.xlsx b/data/gapminder/1962.xlsx
new file mode 100644
index 000000000..621e4c682
Binary files /dev/null and b/data/gapminder/1962.xlsx differ
diff --git a/data/gapminder/1967.xlsx b/data/gapminder/1967.xlsx
new file mode 100644
index 000000000..337a45da9
Binary files /dev/null and b/data/gapminder/1967.xlsx differ
diff --git a/data/gapminder/1972.xlsx b/data/gapminder/1972.xlsx
new file mode 100644
index 000000000..21f9de80e
Binary files /dev/null and b/data/gapminder/1972.xlsx differ
diff --git a/data/gapminder/1977.xlsx b/data/gapminder/1977.xlsx
new file mode 100644
index 000000000..f71a9f501
Binary files /dev/null and b/data/gapminder/1977.xlsx differ
diff --git a/data/gapminder/1982.xlsx b/data/gapminder/1982.xlsx
new file mode 100644
index 000000000..0ff0eae80
Binary files /dev/null and b/data/gapminder/1982.xlsx differ
diff --git a/data/gapminder/1987.xlsx b/data/gapminder/1987.xlsx
new file mode 100644
index 000000000..a0b10ceb3
Binary files /dev/null and b/data/gapminder/1987.xlsx differ
diff --git a/data/gapminder/1992.xlsx b/data/gapminder/1992.xlsx
new file mode 100644
index 000000000..6ae0e5690
Binary files /dev/null and b/data/gapminder/1992.xlsx differ
diff --git a/data/gapminder/1997.xlsx b/data/gapminder/1997.xlsx
new file mode 100644
index 000000000..fe6517026
Binary files /dev/null and b/data/gapminder/1997.xlsx differ
diff --git a/data/gapminder/2002.xlsx b/data/gapminder/2002.xlsx
new file mode 100644
index 000000000..f794a287a
Binary files /dev/null and b/data/gapminder/2002.xlsx differ
diff --git a/data/gapminder/2007.xlsx b/data/gapminder/2007.xlsx
new file mode 100644
index 000000000..0601ec54c
Binary files /dev/null and b/data/gapminder/2007.xlsx differ
diff --git a/data/penguins.xlsx b/data/penguins.xlsx
new file mode 100644
index 000000000..2e198cea3
Binary files /dev/null and b/data/penguins.xlsx differ
diff --git a/data/roster.xlsx b/data/roster.xlsx
new file mode 100644
index 000000000..43943aecb
Binary files /dev/null and b/data/roster.xlsx differ
diff --git a/data/sales.xlsx b/data/sales.xlsx
new file mode 100644
index 000000000..c9d306cd4
Binary files /dev/null and b/data/sales.xlsx differ
diff --git a/data/students.csv b/data/students.csv
new file mode 100644
index 000000000..3ad908e5a
--- /dev/null
+++ b/data/students.csv
@@ -0,0 +1,7 @@
+Student ID,Full Name,favourite.food,mealPlan,AGE
+1,Sunil Huffmann,Strawberry yoghurt,Lunch only,4
+2,Barclay Lynn,French fries,Lunch only,5
+3,Jayendra Lyne,N/A,Breakfast and lunch,7
+4,Leon Rossini,Anchovies,Lunch only,
+5,Chidiegwu Dunkel,Pizza,Breakfast and lunch,five
+6,Güvenç Attila,Ice cream,Lunch only,6
\ No newline at end of file
diff --git a/data/students.xlsx b/data/students.xlsx
new file mode 100644
index 000000000..a0345c461
Binary files /dev/null and b/data/students.xlsx differ
diff --git a/data/survey.xlsx b/data/survey.xlsx
new file mode 100644
index 000000000..e4e647d0b
Binary files /dev/null and b/data/survey.xlsx differ
diff --git a/databases.qmd b/databases.qmd
new file mode 100644
index 000000000..b67287b35
--- /dev/null
+++ b/databases.qmd
@@ -0,0 +1,666 @@
+# Databases {#sec-import-databases}
+
+```{r}
+#| echo: false
+
+source("_common.R")
+```
+
+## Introduction
+
+A huge amount of data lives in databases, so it's essential that you know how to access it.
+Sometimes you can ask someone to download a snapshot into a `.csv` for you, but this gets painful quickly: every time you need to make a change you'll have to communicate with another human.
+You want to be able to reach into the database directly to get the data you need, when you need it.
+
+In this chapter, you'll first learn the basics of the DBI package: how to use it to connect to a database and then retrieve data with a SQL[^databases-1] query.
+**SQL**, short for **s**tructured **q**uery **l**anguage, is the lingua franca of databases, and is an important language for all data scientists to learn.
+That said, we're not going to start with SQL, but instead we'll teach you dbplyr, which can translate your dplyr code to SQL.
+We'll use that as a way to teach you some of the most important features of SQL.
+You won't become a SQL master by the end of the chapter, but you will be able to identify the most important components and understand what they do.
+
+[^databases-1]: SQL is either pronounced "s"-"q"-"l" or "sequel".
+
+### Prerequisites
+
+In this chapter, we'll introduce DBI and dbplyr.
+DBI is a low-level interface that connects to databases and executes SQL; dbplyr is a high-level interface that translates your dplyr code to SQL queries then executes them with DBI.
+
+```{r}
+#| label: setup
+#| message: false
+
+library(DBI)
+library(dbplyr)
+library(tidyverse)
+```
+
+## Database basics
+
+At the simplest level, you can think about a database as a collection of data frames, called **tables** in database terminology.
+Like a data frame, a database table is a collection of named columns, where every value in the column is the same type.
+There are three high-level differences between data frames and database tables:
+
+- Database tables are stored on disk and can be arbitrarily large.
+  Data frames are stored in memory, and are fundamentally limited (although that limit is still plenty large for many problems).
+
+- Database tables almost always have indexes.
+  Much like the index of a book, a database index makes it possible to quickly find rows of interest without having to look at every single row.
+  Data frames and tibbles don't have indexes, but data.tables do, which is one of the reasons that they're so fast.
+
+- Most classical databases are optimized for rapidly collecting data, not analyzing existing data.
+  These databases are called **row-oriented** because the data is stored row-by-row, rather than column-by-column like R.
+  More recently, there's been much development of **column-oriented** databases that make analyzing the existing data much faster.
+
+Databases are run by database management systems (**DBMS**'s for short), which come in three basic forms:
+
+- **Client-server** DBMS's run on a powerful central server, which you connect to from your computer (the client). They are great for sharing data with multiple people in an organization. Popular client-server DBMS's include PostgreSQL, MariaDB, SQL Server, and Oracle.
+- **Cloud** DBMS's, like Snowflake, Amazon's RedShift, and Google's BigQuery, are similar to client-server DBMS's, but they run in the cloud. This means that they can easily handle extremely large datasets and can automatically provide more compute resources as needed.
+- **In-process** DBMS's, like SQLite or duckdb, run entirely on your computer. They're great for working with large datasets where you're the primary user.
+
+## Connecting to a database
+
+To connect to the database from R, you'll use a pair of packages:
+
+- You'll always use DBI (**d**ata**b**ase **i**nterface) because it provides a set of generic functions that connect to the database, upload data, run SQL queries, etc.
+
+- You'll also use a package tailored for the DBMS you're connecting to.
+  This package translates the generic DBI commands into the specifics needed for a given DBMS.
+  There's usually one package for each DBMS, e.g.
+  RPostgres for PostgreSQL and RMariaDB for MariaDB.
+
+If you can't find a specific package for your DBMS, you can usually use the odbc package instead.
+This uses the ODBC protocol supported by many DBMS's.
+odbc requires a little more setup because you'll also need to install an ODBC driver and tell the odbc package where to find it.
+
+Concretely, you create a database connection using `DBI::dbConnect()`.
+The first argument selects the DBMS[^databases-2], then the second and subsequent arguments describe how to connect to it (i.e. where it lives and the credentials that you need to access it).
+The following code shows a couple of typical examples:
+
+[^databases-2]: Typically, this is the only function you'll use from the client package, so we recommend using `::` to pull out that one function, rather than loading the complete package with `library()`.
+
+```{r}
+#| eval: false
+con <- DBI::dbConnect(
+  RMariaDB::MariaDB(),
+  username = "foo"
+)
+con <- DBI::dbConnect(
+  RPostgres::Postgres(),
+  host = "databases.mycompany.com",
+  port = 1234
+)
+```
+
+The precise details of the connection vary a lot from DBMS to DBMS, so unfortunately we can't cover all the details here.
+This means you'll need to do a little research on your own.
+Typically you can ask the other data scientists in your team or talk to your DBA (**d**ata**b**ase **a**dministrator).
+The initial setup will often take a little fiddling (and maybe some googling) to get it right, but you'll generally only need to do it once.
+
+### In this book
+
+Setting up a client-server or cloud DBMS would be a pain for this book, so we'll instead use an in-process DBMS that lives entirely in an R package: duckdb.
+Thanks to the magic of DBI, the only difference between using duckdb and any other DBMS is how you'll connect to the database.
+This makes it great to teach with because you can easily run this code, and then easily take what you learn and apply it elsewhere.
+
+Connecting to duckdb is particularly simple because the defaults create a temporary database that is deleted when you quit R.
+That's great for learning because it guarantees that you'll start from a clean slate every time you restart R:
+
+```{r}
+con <- DBI::dbConnect(duckdb::duckdb())
+```
+
+duckdb is a high-performance database that's designed very much for the needs of a data scientist.
+We use it here because it's very easy to get started with, but it's also capable of handling gigabytes of data with great speed.
+If you want to use duckdb for a real data analysis project, you'll also need to supply the `dbdir` argument to make a persistent database and tell duckdb where to save it.
+Assuming you're using a project (@sec-workflow-scripts-projects), it's reasonable to store it in the `duckdb` directory of the current project:
+
+```{r}
+#| eval: false
+con <- DBI::dbConnect(duckdb::duckdb(), dbdir = "duckdb")
+```
+
+### Load some data {#sec-load-data}
+
+Since this is a new database, we need to start by adding some data.
+Here we'll add the `mpg` and `diamonds` datasets from ggplot2 using `DBI::dbWriteTable()`.
+The simplest usage of `dbWriteTable()` needs three arguments: a database connection, the name of the table to create in the database, and a data frame of data.
+
+```{r}
+dbWriteTable(con, "mpg", ggplot2::mpg)
+dbWriteTable(con, "diamonds", ggplot2::diamonds)
+```
+
+If you're using duckdb in a real project, we highly recommend learning about `duckdb_read_csv()` and `duckdb_register_arrow()`.
+These give you powerful and performant ways to quickly load data directly into duckdb, without having to first load it into R.
+We'll also show off a useful technique for loading multiple files into a database in @sec-save-database.
+
+### DBI basics
+
+You can check that the data is loaded correctly by using a couple of other DBI functions: `dbListTables()` lists all tables in the database[^databases-3] and `dbReadTable()` retrieves the contents of a table.
+
+[^databases-3]: At least, all the tables that you have permission to see.
+
+```{r}
+dbListTables(con)
+
+con |>
+  dbReadTable("diamonds") |>
+  as_tibble()
+```
+
+`dbReadTable()` returns a `data.frame`, so we use `as_tibble()` to convert it into a tibble, which prints more nicely.
+
+If you already know SQL, you can use `dbGetQuery()` to get the results of running a query on the database:
+
+```{r}
+sql <- "
+  SELECT carat, cut, clarity, color, price
+  FROM diamonds
+  WHERE price > 15000
+"
+as_tibble(dbGetQuery(con, sql))
+```
+
+If you've never seen SQL before, don't worry!
+You'll learn more about it shortly.
+But if you read it carefully, you might guess that it selects five columns of the diamonds dataset and all the rows where `price` is greater than 15,000.
+
+## dbplyr basics
+
+Now that we've connected to a database and loaded up some data, we can start to learn about dbplyr.
+dbplyr is a dplyr **backend**, which means that you keep writing dplyr code but the backend executes it differently.
+In this case, dbplyr translates your dplyr code to SQL; other backends include [dtplyr](https://dtplyr.tidyverse.org) which translates to [data.table](https://r-datatable.com), and [multidplyr](https://multidplyr.tidyverse.org) which executes your code on multiple cores.
+
+To use dbplyr, you must first use `tbl()` to create an object that represents a database table:
+
+```{r}
+diamonds_db <- tbl(con, "diamonds")
+diamonds_db
+```
+
+::: callout-note
+There are two other common ways to interact with a database.
+First, many corporate databases are very large so you need some hierarchy to keep all the tables organized.
+In that case you might need to supply a schema, or a catalog and a schema, in order to pick the table you're interested in:
+
+```{r}
+#| eval: false
+diamonds_db <- tbl(con, in_schema("sales", "diamonds"))
+diamonds_db <- tbl(con, in_catalog("north_america", "sales", "diamonds"))
+```
+
+Other times you might want to use your own SQL query as a starting point:
+
+```{r}
+#| eval: false
+diamonds_db <- tbl(con, sql("SELECT * FROM diamonds"))
+```
+:::
+
+This object is **lazy**; when you use dplyr verbs on it, dplyr doesn't do any work: it just records the sequence of operations that you want to perform and only performs them when needed.
+For example, take the following pipeline:
+
+```{r}
+big_diamonds_db <- diamonds_db |>
+  filter(price > 15000) |>
+  select(carat:clarity, price)
+
+big_diamonds_db
+```
+
+You can tell this object represents a database query because it prints the DBMS name at the top, and while it tells you the number of columns, it typically doesn't know the number of rows.
+This is because finding the total number of rows usually requires executing the complete query, something we're trying to avoid.
+
+You can see the SQL code generated by the dplyr function `show_query()`.
+If you know dplyr, this is a great way to learn SQL!
+Write some dplyr code, get dbplyr to translate it to SQL, and then try to figure out how the two languages match up.
+
+```{r}
+big_diamonds_db |>
+  show_query()
+```
+
+To get all the data back into R, you call `collect()`.
+Behind the scenes, this generates the SQL, calls `dbGetQuery()` to get the data, then turns the result into a tibble:
+
+```{r}
+big_diamonds <- big_diamonds_db |>
+  collect()
+big_diamonds
+```
+
+Typically, you'll use dbplyr to select the data you want from the database, performing basic filtering and aggregation using the translations described below.
+Then, once you're ready to analyze the data with functions that are unique to R, you'll `collect()` the data to get an in-memory tibble, and continue your work with pure R code.
+
+## SQL
+
+The rest of the chapter will teach you a little SQL through the lens of dbplyr.
+It's a rather non-traditional introduction to SQL, but we hope it will get you quickly up to speed with the basics.
+Luckily, if you understand dplyr you're in a great place to quickly pick up SQL because so many of the concepts are the same.
+
+We'll explore the relationship between dplyr and SQL using a couple of old friends from the nycflights13 package: `flights` and `planes`.
+These datasets are easy to get into our learning database because dbplyr comes with a function that copies the tables from nycflights13 to our database:
+
+```{r}
+dbplyr::copy_nycflights13(con)
+flights <- tbl(con, "flights")
+planes <- tbl(con, "planes")
+```
+
+```{r}
+#| echo: false
+options(dplyr.strict_sql = TRUE)
+```
+
+### SQL basics
+
+The top-level components of SQL are called **statements**.
+Common statements include `CREATE` for defining new tables, `INSERT` for adding data, and `SELECT` for retrieving data.
+We will focus on `SELECT` statements, also called **queries**, because they are almost exclusively what you'll use as a data scientist.
+
+A query is made up of **clauses**.
+There are five important clauses: `SELECT`, `FROM`, `WHERE`, `ORDER BY`, and `GROUP BY`.
+Every query must have the `SELECT`[^databases-4] and `FROM`[^databases-5] clauses, and the simplest query is `SELECT * FROM table`, which selects all columns from the specified table.
+This is what dbplyr generates for an unadulterated table:
+
+[^databases-4]: Confusingly, depending on the context, `SELECT` is either a statement or a clause.
+    To avoid this confusion, we'll generally use `SELECT` query instead of `SELECT` statement.
+
+[^databases-5]: Ok, technically, only the `SELECT` is required, since you can write queries like `SELECT 1+1` to perform basic calculations.
+    But if you want to work with data (as you always do!) you'll also need a `FROM` clause.
+
+```{r}
+flights |> show_query()
+planes |> show_query()
+```
+
+`WHERE` and `ORDER BY` control which rows are included and how they are ordered:
+
+```{r}
+flights |>
+  filter(dest == "IAH") |>
+  arrange(dep_delay) |>
+  show_query()
+```
+
+`GROUP BY` converts the query to a summary, causing aggregation to happen:
+
+```{r}
+flights |>
+  group_by(dest) |>
+  summarize(dep_delay = mean(dep_delay, na.rm = TRUE)) |>
+  show_query()
+```
+
+There are two important differences between dplyr verbs and SELECT clauses:
+
+- In SQL, case doesn't matter: you can write `select`, `SELECT`, or even `SeLeCt`. In this book we'll stick with the common convention of writing SQL keywords in uppercase to distinguish them from table or variable names.
+- In SQL, order matters: you must always write the clauses in the order `SELECT`, `FROM`, `WHERE`, `GROUP BY`, `ORDER BY`. Confusingly, this order doesn't match how the clauses are actually evaluated, which is first `FROM`, then `WHERE`, `GROUP BY`, `SELECT`, and `ORDER BY`.
+
+The following sections explore each clause in more detail.
+
+::: callout-note
+Note that while SQL is a standard, it is extremely complex and no database follows it exactly.
+While the main components that we'll focus on in this book are very similar between DBMS's, there are many minor variations.
+Fortunately, dbplyr is designed to handle this problem and generates different translations for different databases.
+It's not perfect, but it's continually improving, and if you hit a problem you can file an issue [on GitHub](https://github.com/tidyverse/dbplyr/issues/) to help us do better.
+:::
+
+### SELECT
+
+The `SELECT` clause is the workhorse of queries and performs the same job as `select()`, `mutate()`, `rename()`, `relocate()`, and, as you'll learn in the next section, `summarize()`.
+
+`select()`, `rename()`, and `relocate()` have very direct translations to `SELECT` as they just affect where a column appears (if at all) along with its name:
+
+```{r}
+planes |>
+  select(tailnum, type, manufacturer, model, year) |>
+  show_query()
+
+planes |>
+  select(tailnum, type, manufacturer, model, year) |>
+  rename(year_built = year) |>
+  show_query()
+
+planes |>
+  select(tailnum, type, manufacturer, model, year) |>
+  relocate(manufacturer, model, .before = type) |>
+  show_query()
+```
+
+This example also shows you how SQL does renaming.
+In SQL terminology, renaming is called **aliasing** and is done with `AS`.
+Note that unlike `mutate()`, the old name is on the left and the new name is on the right.
+
+::: callout-note
+In the examples above, note that `"year"` and `"type"` are wrapped in double quotes.
+That's because these are **reserved words** in duckdb, so dbplyr quotes them to avoid any potential confusion between column/table names and SQL operators.
+
+When working with other databases you're likely to see every variable name quoted, because only a handful of client packages, like duckdb, know what all the reserved words are, so they quote everything to be safe.
+
+``` sql
+SELECT "tailnum", "type", "manufacturer", "model", "year"
+FROM "planes"
+```
+
+Some other database systems use backticks instead of quotes:
+
+``` sql
+SELECT `tailnum`, `type`, `manufacturer`, `model`, `year`
+FROM `planes`
+```
+:::
+
+The translations for `mutate()` are similarly straightforward: each variable becomes a new expression in `SELECT`:
+
+```{r}
+flights |>
+  mutate(
+    speed = distance / (air_time / 60)
+  ) |>
+  show_query()
+```
+
+We'll come back to the translation of individual components (like `/`) in @sec-sql-expressions.
+
+### FROM
+
+The `FROM` clause defines the data source.
+It's going to be rather uninteresting for a little while, because we're just using single tables.
+You'll see more complex examples once we hit the join functions.
+
+### GROUP BY
+
+`group_by()` is translated to the `GROUP BY`[^databases-6] clause and `summarize()` is translated to the `SELECT` clause:
+
+[^databases-6]: This is no coincidence: the dplyr function name was inspired by the SQL clause.
+
+```{r}
+diamonds_db |>
+  group_by(cut) |>
+  summarize(
+    n = n(),
+    avg_price = mean(price, na.rm = TRUE)
+  ) |>
+  show_query()
+```
+
+We'll come back to what's happening with the translation of `n()` and `mean()` in @sec-sql-expressions.
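+
+If you want to check your reading of the translation, one option is to write the query by hand and run it with `dbGetQuery()` from earlier in the chapter (a sketch; dbplyr's own output will differ in quoting and formatting):
+
+```{r}
+#| eval: false
+
+# Hand-written equivalent of the grouped summary above:
+# n() becomes COUNT(*), and mean(price, na.rm = TRUE) becomes AVG(price).
+dbGetQuery(con, "
+  SELECT cut, COUNT(*) AS n, AVG(price) AS avg_price
+  FROM diamonds
+  GROUP BY cut
+")
+```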
+
+### WHERE
+
+`filter()` is translated to the `WHERE` clause:
+
+```{r}
+flights |>
+  filter(dest == "IAH" | dest == "HOU") |>
+  show_query()
+
+flights |>
+  filter(arr_delay > 0 & arr_delay < 20) |>
+  show_query()
+```
+
+There are a few important details to note here:
+
+- `|` becomes `OR` and `&` becomes `AND`.
+- SQL uses `=` for comparison, not `==`. SQL doesn't have assignment, so there's no potential for confusion there.
+- SQL uses only `''` for strings, not `""`. In SQL, `""` is used to identify variables, like R's ``` `` ```.
+
+Another useful SQL operator is `IN`, which is very close to R's `%in%`:
+
+```{r}
+flights |>
+  filter(dest %in% c("IAH", "HOU")) |>
+  show_query()
+```
+
+SQL uses `NULL` instead of `NA`.
+`NULL`s behave similarly to `NA`s.
+The main difference is that while they're "infectious" in comparisons and arithmetic, they are silently dropped when summarizing.
+dbplyr will remind you about this behavior the first time you hit it:
+
+```{r}
+flights |>
+  group_by(dest) |>
+  summarize(delay = mean(arr_delay))
+```
+
+If you want to learn more about how `NULL`s work, you might enjoy "[*Three valued logic*](https://modern-sql.com/concept/three-valued-logic)" by Markus Winand.
+
+In general, you can work with `NULL`s using the functions you'd use for `NA`s in R:
+
+```{r}
+flights |>
+  filter(!is.na(dep_delay)) |>
+  show_query()
+```
+
+This SQL query illustrates one of the drawbacks of dbplyr: while the SQL is correct, it isn't as simple as you might write by hand.
+In this case, you could drop the parentheses and use a special operator that's easier to read:
+
+``` sql
+WHERE "dep_delay" IS NOT NULL
+```
+
+Note that if you `filter()` a variable that you created using `summarize()`, dbplyr will generate a `HAVING` clause, rather than a `WHERE` clause.
+This is one of the idiosyncrasies of SQL: `WHERE` is evaluated before `SELECT` and `GROUP BY`, so SQL needs another clause that's evaluated afterwards.
+
+```{r}
+diamonds_db |>
+  group_by(cut) |>
+  summarize(n = n()) |>
+  filter(n > 100) |>
+  show_query()
+```
+
+### ORDER BY
+
+Ordering rows involves a straightforward translation from `arrange()` to the `ORDER BY` clause:
+
+```{r}
+flights |>
+  arrange(year, month, day, desc(dep_delay)) |>
+  show_query()
+```
+
+Notice how `desc()` is translated to `DESC`: this is one of the many dplyr functions whose name was directly inspired by SQL.
+
+### Subqueries
+
+Sometimes it's not possible to translate a dplyr pipeline into a single `SELECT` statement and you need to use a subquery.
+A **subquery** is just a query used as a data source in the `FROM` clause, instead of the usual table.
+
+dbplyr typically uses subqueries to work around limitations of SQL.
+For example, expressions in the `SELECT` clause can't refer to columns that were just created.
+That means that the following (silly) dplyr pipeline needs to happen in two steps: the first (inner) query computes `year1` and then the second (outer) query can compute `year2`.
+
+```{r}
+flights |>
+  mutate(
+    year1 = year + 1,
+    year2 = year1 + 1
+  ) |>
+  show_query()
+```
+
+You'll also see this if you attempt to `filter()` a variable that you just created.
+Remember, even though `WHERE` is written after `SELECT`, it's evaluated before it, so we need a subquery in this (silly) example:
+
+```{r}
+flights |>
+  mutate(year1 = year + 1) |>
+  filter(year1 == 2014) |>
+  show_query()
+```
+
+Sometimes dbplyr will create a subquery where it's not needed, because it doesn't yet know how to optimize that translation.
+As dbplyr improves over time, these cases will get rarer but will probably never go away.
+
+### Joins
+
+If you're familiar with dplyr's joins, SQL joins are very similar.
+Here's a simple example:
+
+```{r}
+flights |>
+  left_join(planes |> rename(year_built = year), by = "tailnum") |>
+  show_query()
+```
+
+The main thing to notice here is the syntax: SQL joins use sub-clauses of the `FROM` clause to bring in additional tables, using `ON` to define how the tables are related.
+
+dplyr's names for these functions are so closely connected to SQL that you can easily guess the equivalent SQL for `inner_join()`, `right_join()`, and `full_join()`:
+
+``` sql
+SELECT flights.*, "type", manufacturer, model, engines, seats, speed
+FROM flights
+INNER JOIN planes ON (flights.tailnum = planes.tailnum)
+
+SELECT flights.*, "type", manufacturer, model, engines, seats, speed
+FROM flights
+RIGHT JOIN planes ON (flights.tailnum = planes.tailnum)
+
+SELECT flights.*, "type", manufacturer, model, engines, seats, speed
+FROM flights
+FULL JOIN planes ON (flights.tailnum = planes.tailnum)
+```
+
+You're likely to need many joins when working with data from a database.
+That's because database tables are often stored in a highly normalized form, where each "fact" is stored in a single place; to keep a complete dataset for analysis, you need to navigate a complex network of tables connected by primary and foreign keys.
+If you hit this scenario, the [dm package](https://cynkra.github.io/dm/), by Tobias Schieferdecker, Kirill Müller, and Darko Bergant, is a lifesaver.
+It can automatically determine the connections between tables using the constraints that DBAs often supply, visualize the connections so you can see what's going on, and generate the joins you need to connect one table to another.
+
+### Other verbs
+
+dbplyr also translates other verbs like `distinct()`, `slice_*()`, and `intersect()`, and a growing selection of tidyr functions like `pivot_longer()` and `pivot_wider()`.
+The easiest way to see the full set of what's currently available is to visit the dbplyr website: <https://dbplyr.tidyverse.org>.
+
+### Exercises
+
+1. What is `distinct()` translated to?
+    How about `head()`?
+
+2. Explain what each of the following SQL queries does and try to recreate them using dbplyr.
+
+    ``` sql
+    SELECT *
+    FROM flights
+    WHERE dep_delay < arr_delay
+
+    SELECT *, distance / (air_time / 60) AS speed
+    FROM flights
+    ```
+
+## Function translations {#sec-sql-expressions}
+
+So far we've focused on the big picture of how dplyr verbs are translated to the clauses of a query.
+Now we're going to zoom in a little and talk about the translation of the R functions that work with individual columns, e.g., what happens when you use `mean(x)` in a `summarize()`?
+
+To help see what's going on, we'll use a couple of little helper functions that run a `summarize()` or `mutate()` and show the generated SQL.
+That will make it a little easier to explore a few variations and see how summaries and transformations can differ.
+
+```{r}
+summarize_query <- function(df, ...) {
+  df |>
+    summarize(...) |>
+    show_query()
+}
+mutate_query <- function(df, ...) {
+  df |>
+    mutate(..., .keep = "none") |>
+    show_query()
+}
+```
+
+Let's dive in with some summaries!
+Looking at the code below you'll notice that some summary functions, like `mean()`, have a relatively simple translation while others, like `median()`, are much more complex.
+The complexity is typically higher for operations that are common in statistics but less common in databases.
+
+```{r}
+flights |>
+  group_by(year, month, day) |>
+  summarize_query(
+    mean = mean(arr_delay, na.rm = TRUE),
+    median = median(arr_delay, na.rm = TRUE)
+  )
+```
+
+The translation of summary functions becomes more complicated when you use them inside a `mutate()` because they have to turn into so-called **window** functions.
+In SQL, you turn an ordinary aggregation function into a window function by adding `OVER` after it:
+
+```{r}
+flights |>
+  group_by(year, month, day) |>
+  mutate_query(
+    mean = mean(arr_delay, na.rm = TRUE),
+  )
+```
+
+In SQL, the `GROUP BY` clause is used exclusively for summaries, so here you can see that the grouping has moved from the `GROUP BY` clause to the `PARTITION BY` argument of `OVER`.
+
+Window functions include all functions that look forwards or backwards, like `lead()` and `lag()`, which look at the "next" or "previous" value respectively:
+
+```{r}
+flights |>
+  group_by(dest) |>
+  arrange(time_hour) |>
+  mutate_query(
+    lead = lead(arr_delay),
+    lag = lag(arr_delay)
+  )
+```
+
+Here it's important to `arrange()` the data, because SQL tables have no intrinsic order.
+In fact, if you don't use `arrange()` you might get the rows back in a different order every time!
+Notice that for window functions, the ordering information is repeated: the `ORDER BY` clause of the main query doesn't automatically apply to window functions.
+
+Another important SQL function is `CASE WHEN`. It's used as the translation of `if_else()` and `case_when()`, the dplyr function that it directly inspired.
+Here are a couple of simple examples:
+
+```{r}
+flights |>
+  mutate_query(
+    description = if_else(arr_delay > 0, "delayed", "on-time")
+  )
+flights |>
+  mutate_query(
+    description =
+      case_when(
+        arr_delay < -5 ~ "early",
+        arr_delay < 5 ~ "on-time",
+        arr_delay >= 5 ~ "late"
+      )
+  )
+```
+
+`CASE WHEN` is also used for some other functions that don't have a direct translation from R to SQL.
+A good example of this is `cut()`:
+
+```{r}
+flights |>
+  mutate_query(
+    description = cut(
+      arr_delay,
+      breaks = c(-Inf, -5, 5, Inf),
+      labels = c("early", "on-time", "late")
+    )
+  )
+```
+
+dbplyr also translates common string and date-time manipulation functions, which you can learn about in `vignette("translation-function", package = "dbplyr")`.
+dbplyr's translations are certainly not perfect, and there are many R functions that aren't translated yet, but dbplyr does a surprisingly good job covering the functions that you'll use most of the time.
+
+## Summary
+
+In this chapter you learned how to access data from databases.
+We focused on dbplyr, a dplyr "backend" that allows you to write the dplyr code you're familiar with, and have it be automatically translated to SQL.
+We used that translation to teach you a little SQL; it's important to learn some SQL because it's *the* most commonly used language for working with data and knowing some will make it easier for you to communicate with other data folks who don't use R.
+If you've finished this chapter and would like to learn more about SQL, we have two recommendations:
+
+- [*SQL for Data Scientists*](https://sqlfordatascientists.com) by Renée M. P. Teate is an introduction to SQL designed specifically for the needs of data scientists, and includes examples of the sort of highly interconnected data you're likely to encounter in real organizations.
+- [*Practical SQL*](https://www.practicalsql.com) by Anthony DeBarros is written from the perspective of a data journalist (a data scientist specialized in telling compelling stories) and goes into more detail about getting your data into a database and running your own DBMS.
+
+In the next chapter, we'll learn about another dplyr backend for working with large data: arrow.
+Arrow is designed for working with large files on disk, and is a natural complement to databases.
diff --git a/datetimes.Rmd b/datetimes.Rmd
deleted file mode 100644
index e9477f558..000000000
--- a/datetimes.Rmd
+++ /dev/null
@@ -1,571 +0,0 @@
-# Dates and times
-
-## Introduction
-
-This chapter will show you how to work with dates and times in R. At first glance, dates and times seem simple. You use them all the time in your regular life, and they don't seem to cause much confusion. However, the more you learn about dates and times, the more complicated they seem to get. To warm up, try these three seemingly simple questions:
-
-* Does every year have 365 days?
-* Does every day have 24 hours?
-* Does every minute have 60 seconds?
-
-I'm sure you know that not every year has 365 days, but do you know the full rule for determining if a year is a leap year? (It has three parts.) You might have remembered that many parts of the world use daylight savings time (DST), so that some days have 23 hours, and others have 25. You might not have known that some minutes have 61 seconds because every now and then leap seconds are added because the Earth's rotation is gradually slowing down.
-
-Dates and times are hard because they have to reconcile two physical phenomena (the rotation of the Earth and its orbit around the sun) with a whole raft of geopolitical phenomena including months, time zones, and DST. This chapter won't teach you every last detail about dates and times, but it will give you a solid grounding of practical skills that will help you with common data analysis challenges.
-
-### Prerequisites
-
-This chapter will focus on the __lubridate__ package, which makes it easier to work with dates and times in R. lubridate is not part of core tidyverse because you only need it when you're working with dates/times. We will also need nycflights13 for practice data.
-
-```{r setup, message = FALSE}
-library(tidyverse)
-
-library(lubridate)
-library(nycflights13)
-```
-
-## Creating date/times
-
-There are three types of date/time data that refer to an instant in time:
-
-* A __date__. Tibbles print this as `<date>`.
-
-* A __time__ within a day. Tibbles print this as `