From b48021af6ac665a9482ded819b9cb96c632fd252 Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Wed, 3 May 2023 07:47:11 -0700 Subject: [PATCH] [automation] transform lesson to sandpaper --- .editorconfig | 26 ++ .github/workflows/README.md | 198 ++++++++++++++ .github/workflows/pr-close-signal.yaml | 23 ++ .github/workflows/pr-comment.yaml | 185 +++++++++++++ .github/workflows/pr-post-remove-branch.yaml | 32 +++ .github/workflows/pr-preflight.yaml | 39 +++ .github/workflows/pr-receive.yaml | 131 ++++++++++ .github/workflows/sandpaper-main.yaml | 61 +++++ .github/workflows/sandpaper-version.txt | 1 + .github/workflows/update-cache.yaml | 125 +++++++++ .github/workflows/update-workflows.yaml | 66 +++++ .github/workflows/workbench-beta-phase.yml | 60 +++++ .gitignore | 55 ++++ CODE_OF_CONDUCT.md | 13 + CONTRIBUTING.md | 121 +++++++++ LICENSE.md | 79 ++++++ README.md | 16 +- _extras/guide.md | 114 -------- config.yaml | 87 +++++++ episodes/00-intro.md | 100 ++++--- episodes/01-format-data.md | 219 +++++++++------- episodes/02-common-mistakes.md | 166 ++++++------ episodes/03-dates-as-data.md | 245 +++++++++++------- episodes/04-quality-control.md | 182 +++++++------ episodes/05-exporting-data.md | 98 ++++--- .../data}/survey_data_spreadsheet_messy.xls | Bin .../data}/survey_sorting_exercise.xlsx | Bin {fig => episodes/fig}/1_helpful_clippy.jpg | Bin {fig => episodes/fig}/2_datasheet_example.jpg | Bin {fig => episodes/fig}/3_white_table_1.jpg | Bin {fig => episodes/fig}/4_merged_cells.jpg | Bin {fig => episodes/fig}/5_excel_dates_1.jpg | Bin {fig => episodes/fig}/6_excel_dates_2.jpg | Bin {fig => episodes/fig}/7_excel_dates_3.jpg | Bin {img => episodes/fig}/DC1_logo_small.png | Bin {img => episodes/fig}/DataONE_LOGO.jpg | Bin .../fig}/conditional_formating.png | Bin .../creative-commons-attribution-license.png | Bin {fig => episodes/fig}/csv-mistake.png | Bin {fig => episodes/fig}/data_validation.png | Bin .../fig}/data_validation_window.png | Bin {fig => 
episodes/fig}/drop_down_list.png | Bin {fig => episodes/fig}/drop_down_list2.png | Bin {fig => episodes/fig}/error_alert.png | Bin {fig => episodes/fig}/excel-to-csv.png | Bin .../fig}/excel_tables_example.png | Bin .../fig}/excel_tables_example1.png | Bin {fig => episodes/fig}/formatting.png | Bin {fig => episodes/fig}/good_formatting.png | Bin {fig => episodes/fig}/input_message.png | Bin {fig => episodes/fig}/invalid_value.png | Bin {fig => episodes/fig}/multiple-info.png | Bin {fig => episodes/fig}/plot_validation.png | Bin {fig => episodes/fig}/single-info.png | Bin .../fig}/solution_exercise_1_dates.png | Bin {fig => episodes/fig}/sorting.png | Bin {fig => episodes/fig}/sorting_button.png | Bin {fig => episodes/fig}/sorting_solution_1.png | Bin {fig => episodes/fig}/sorting_solution_2.png | Bin .../fig}/spreadsheet-setup-updated.png | Bin {fig => episodes/fig}/spreadsheet-setup.png | Bin index.md | 75 +++--- {_extras => instructors}/datamanagement.md | 24 +- instructors/instructor-notes.md | 115 ++++++++ {_extras => learners}/discuss.md | 38 +-- reference.md => learners/reference.md | 12 +- learners/setup.md | 58 +++++ profiles/learner-profiles.md | 5 + setup.md | 53 ---- site/README.md | 2 + 70 files changed, 2157 insertions(+), 667 deletions(-) create mode 100644 .editorconfig create mode 100755 .github/workflows/README.md create mode 100755 .github/workflows/pr-close-signal.yaml create mode 100755 .github/workflows/pr-comment.yaml create mode 100755 .github/workflows/pr-post-remove-branch.yaml create mode 100755 .github/workflows/pr-preflight.yaml create mode 100755 .github/workflows/pr-receive.yaml create mode 100755 .github/workflows/sandpaper-main.yaml create mode 100644 .github/workflows/sandpaper-version.txt create mode 100755 .github/workflows/update-cache.yaml create mode 100755 .github/workflows/update-workflows.yaml create mode 100644 .github/workflows/workbench-beta-phase.yml create mode 100644 .gitignore create mode 100644 CODE_OF_CONDUCT.md 
create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE.md delete mode 100644 _extras/guide.md create mode 100644 config.yaml rename {data => episodes/data}/survey_data_spreadsheet_messy.xls (100%) rename {data => episodes/data}/survey_sorting_exercise.xlsx (100%) rename {fig => episodes/fig}/1_helpful_clippy.jpg (100%) rename {fig => episodes/fig}/2_datasheet_example.jpg (100%) rename {fig => episodes/fig}/3_white_table_1.jpg (100%) rename {fig => episodes/fig}/4_merged_cells.jpg (100%) rename {fig => episodes/fig}/5_excel_dates_1.jpg (100%) rename {fig => episodes/fig}/6_excel_dates_2.jpg (100%) rename {fig => episodes/fig}/7_excel_dates_3.jpg (100%) rename {img => episodes/fig}/DC1_logo_small.png (100%) rename {img => episodes/fig}/DataONE_LOGO.jpg (100%) rename {fig => episodes/fig}/conditional_formating.png (100%) rename {img => episodes/fig}/creative-commons-attribution-license.png (100%) rename {fig => episodes/fig}/csv-mistake.png (100%) rename {fig => episodes/fig}/data_validation.png (100%) rename {fig => episodes/fig}/data_validation_window.png (100%) rename {fig => episodes/fig}/drop_down_list.png (100%) rename {fig => episodes/fig}/drop_down_list2.png (100%) rename {fig => episodes/fig}/error_alert.png (100%) rename {fig => episodes/fig}/excel-to-csv.png (100%) rename {fig => episodes/fig}/excel_tables_example.png (100%) rename {fig => episodes/fig}/excel_tables_example1.png (100%) rename {fig => episodes/fig}/formatting.png (100%) rename {fig => episodes/fig}/good_formatting.png (100%) rename {fig => episodes/fig}/input_message.png (100%) rename {fig => episodes/fig}/invalid_value.png (100%) rename {fig => episodes/fig}/multiple-info.png (100%) rename {fig => episodes/fig}/plot_validation.png (100%) rename {fig => episodes/fig}/single-info.png (100%) rename {fig => episodes/fig}/solution_exercise_1_dates.png (100%) rename {fig => episodes/fig}/sorting.png (100%) rename {fig => episodes/fig}/sorting_button.png (100%) rename {fig => 
episodes/fig}/sorting_solution_1.png (100%) rename {fig => episodes/fig}/sorting_solution_2.png (100%) rename {fig => episodes/fig}/spreadsheet-setup-updated.png (100%) rename {fig => episodes/fig}/spreadsheet-setup.png (100%) rename {_extras => instructors}/datamanagement.md (65%) create mode 100644 instructors/instructor-notes.md rename {_extras => learners}/discuss.md (76%) rename reference.md => learners/reference.md (88%) create mode 100644 learners/setup.md create mode 100644 profiles/learner-profiles.md delete mode 100644 setup.md create mode 100644 site/README.md diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..5bf4860 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,26 @@ +root = true + +[*] +charset = utf-8 +insert_final_newline = true +trim_trailing_whitespace = true + +[*.md] +indent_size = 2 +indent_style = space +max_line_length = 100 # Please keep this in sync with bin/lesson_check.py! +trim_trailing_whitespace = false # keep trailing spaces in markdown - 2+ spaces are translated to a hard break (
) + +[*.r] +max_line_length = 80 + +[*.py] +indent_size = 4 +indent_style = space +max_line_length = 79 + +[*.sh] +end_of_line = lf + +[Makefile] +indent_style = tab diff --git a/.github/workflows/README.md b/.github/workflows/README.md new file mode 100755 index 0000000..101967e --- /dev/null +++ b/.github/workflows/README.md @@ -0,0 +1,198 @@ +# Carpentries Workflows + +This directory contains workflows to be used for Lessons using the {sandpaper} +lesson infrastructure. Two of these workflows require R (`sandpaper-main.yaml` +and `pr-receive.yaml`) and the rest are bots to handle pull request management. + +These workflows will likely change as {sandpaper} evolves, so it is important to +keep them up-to-date. To do this in your lesson you can do the following in your +R console: + +```r +# Install/Update sandpaper +options(repos = c(carpentries = "https://carpentries.r-universe.dev/", + CRAN = "https://cloud.r-project.org")) +install.packages("sandpaper") + +# update the workflows in your lesson +library("sandpaper") +update_github_workflows() +``` + +Inside this folder, you will find a file called `sandpaper-version.txt`, which +will contain a version number for sandpaper. This will be used in the future to +alert you if a workflow update is needed. + +What follows are the descriptions of the workflow files: + +## Deployment + +### 01 Build and Deploy (sandpaper-main.yaml) + +This is the main driver that will only act on the main branch of the repository. +This workflow does the following: + + 1. checks out the lesson + 2. provisions the following resources + - R + - pandoc + - lesson infrastructure (stored in a cache) + - lesson dependencies if needed (stored in a cache) + 3. builds the lesson via `sandpaper:::ci_deploy()` + +#### Caching + +This workflow has two caches; one cache is for the lesson infrastructure and +the other is for the lesson dependencies if the lesson contains rendered +content. 
These caches are invalidated by new versions of the infrastructure and +the `renv.lock` file, respectively. If there is a problem with the cache, +manual invalidation is necessary. You will need maintain access to the repository +and you can either go to the actions tab and [click on the caches button to find +and invalidate the failing cache](https://github.blog/changelog/2022-10-20-manage-caches-in-your-actions-workflows-from-web-interface/) +or set the `CACHE_VERSION` secret to the current date (which will +invalidate all of the caches). + +## Updates + +### Setup Information + +These workflows run on a schedule and at the maintainer's request. Because they +create pull requests that update workflows/require the downstream actions to run, +they need a special repository/organization secret token called +`SANDPAPER_WORKFLOW` and it must have the `public_repo` and `workflow` scope. + +This can be an individual user token, OR it can be a trusted bot account. If you +have a repository in one of the official Carpentries accounts, then you do not +need to worry about this token being present because the Carpentries Core Team +will take care of supplying this token. + +If you want to use your personal account: you can go to +[github.com/settings/tokens/new](https://github.com/settings/tokens/new?scopes=public_repo,workflow&description=Sandpaper%20Token) +to create a token. Once you have created your token, you should copy it to your +clipboard and then go to your repository's settings > secrets > actions and +create or edit the `SANDPAPER_WORKFLOW` secret, pasting in the generated token. + +If you do not specify your token correctly, the runs will not fail and they will +give you instructions to provide the token for your repository. + +### 02 Maintain: Update Workflow Files (update-workflows.yaml) + +The {sandpaper} repository was designed to do as much as possible to separate +the tools from the content. For local builds, this is absolutely true, but +there is a minor issue when it comes to workflow files: they must live inside +the repository. 
+ +This workflow ensures that the workflow files are up-to-date. The way it works is +to download the update-workflows.sh script from GitHub and run it. The script +will do the following: + +1. check the recorded version of sandpaper against the current version on github +2. update the files if there is a difference in versions + +After the files are updated, if there are any changes, they are pushed to a +branch called `update/workflows` and a pull request is created. Maintainers are +encouraged to review the changes and accept the pull request if the outputs +are okay. + +This update is run ~~weekly or~~ on demand. + +### 03 Maintain: Update Package Cache (update-cache.yaml) + +For lessons that have generated content, we use {renv} to ensure that the output +is stable. This is controlled by a single lockfile which documents the packages +needed for the lesson and the version numbers. This workflow is skipped in +lessons that do not have generated content. + +Because the lessons need to remain current with the package ecosystem, it's a +good idea to make sure these packages can be updated periodically. The +update cache workflow will do this by checking for updates, applying them in a +branch called `update/packages` and creating a pull request with _only the +lockfile changed_. + +From here, the markdown documents will be rebuilt and you can inspect what has +changed based on how the packages have updated. + +## Pull Request and Review Management + +Because our lessons execute code, pull requests are a security risk for any +lesson and thus have security measures associated with them. 
**Do not merge any +pull requests that do not pass checks and do not have bots commented on them.** + +This series of workflows all go together and are described in the following +diagram and the below sections: + +![Graph representation of a pull request](https://carpentries.github.io/sandpaper/articles/img/pr-flow.dot.svg) + +### Pre Flight Pull Request Validation (pr-preflight.yaml) + +This workflow runs every time a pull request is created and its purpose is to +validate that the pull request is okay to run. This means the following things: + +1. The pull request does not contain modified workflow files +2. If the pull request contains modified workflow files, it does not contain + modified content files (such as a situation where @carpentries-bot will + make an automated pull request) +3. The pull request does not contain an invalid commit hash (e.g. from a fork + that was made before a lesson was transitioned from styles to use the + workbench). + +Once the checks are finished, a comment is issued to the pull request, which +will allow maintainers to determine if it is safe to run the +"Receive Pull Request" workflow from new contributors. + +### Receive Pull Request (pr-receive.yaml) + +**Note of caution:** This workflow runs arbitrary code by anyone who creates a +pull request. GitHub has safeguarded the token used in this workflow to have no +privileges in the repository, but we have taken precautions to protect against +spoofing. + +This workflow is triggered with every push to a pull request. If this workflow +is already running and a new push is sent to the pull request, the workflow +running from the previous push will be cancelled and a new workflow run will be +started. + +The first step of this workflow is to check if it is valid (e.g. that no +workflow files have been modified). If there are workflow files that have been +modified, a comment is made that indicates that the workflow is not run. 
If +both a workflow file and lesson content are modified, an error will occur. + +The second step (if valid) is to build the generated content from the pull +request. This builds the content and uploads three artifacts: + +1. The pull request number (pr) +2. A summary of changes after the rendering process (diff) +3. The rendered files (built) + +Because this workflow builds generated content, it follows the same general +process as the `sandpaper-main` workflow with the same caching mechanisms. + +The artifacts produced are used by the next workflow. + +### Comment on Pull Request (pr-comment.yaml) + +This workflow is triggered if the `pr-receive.yaml` workflow is successful. +The steps in this workflow are: + +1. Test if the workflow is valid and comment the validity of the workflow to the + pull request. +2. If it is valid: create an orphan branch with two commits: the current state + of the repository and the proposed changes. +3. If it is valid: update the pull request comment with the summary of changes + +Importantly: if the pull request is invalid, the branch is not created so any +malicious code is not published. + +From here, the maintainer can request changes from the author and eventually +either merge or reject the PR. When this happens, if the PR was valid, the +preview branch needs to be deleted. + +### Send Close PR Signal (pr-close-signal.yaml) + +Triggered any time a pull request is closed. This emits an artifact that is the +pull request number for the next action. + +### Remove Pull Request Branch (pr-post-remove-branch.yaml) + +Triggered by `pr-close-signal.yaml`. This removes the temporary branch associated with +the pull request (if it was created). 
diff --git a/.github/workflows/pr-close-signal.yaml b/.github/workflows/pr-close-signal.yaml new file mode 100755 index 0000000..9b129d5 --- /dev/null +++ b/.github/workflows/pr-close-signal.yaml @@ -0,0 +1,23 @@ +name: "Bot: Send Close Pull Request Signal" + +on: + pull_request: + types: + [closed] + +jobs: + send-close-signal: + name: "Send closing signal" + runs-on: ubuntu-latest + if: ${{ github.event.action == 'closed' }} + steps: + - name: "Create PRtifact" + run: | + mkdir -p ./pr + printf ${{ github.event.number }} > ./pr/NUM + - name: Upload Diff + uses: actions/upload-artifact@v3 + with: + name: pr + path: ./pr + diff --git a/.github/workflows/pr-comment.yaml b/.github/workflows/pr-comment.yaml new file mode 100755 index 0000000..bb2eb03 --- /dev/null +++ b/.github/workflows/pr-comment.yaml @@ -0,0 +1,185 @@ +name: "Bot: Comment on the Pull Request" + +# read-write repo token +# access to secrets +on: + workflow_run: + workflows: ["Receive Pull Request"] + types: + - completed + +concurrency: + group: pr-${{ github.event.workflow_run.pull_requests[0].number }} + cancel-in-progress: true + + +jobs: + # Pull requests are valid if: + # - they match the sha of the workflow run head commit + # - they are open + # - no .github files were committed + test-pr: + name: "Test if pull request is valid" + runs-on: ubuntu-latest + if: > + github.event.workflow_run.event == 'pull_request' && + github.event.workflow_run.conclusion == 'success' + outputs: + is_valid: ${{ steps.check-pr.outputs.VALID }} + payload: ${{ steps.check-pr.outputs.payload }} + number: ${{ steps.get-pr.outputs.NUM }} + msg: ${{ steps.check-pr.outputs.MSG }} + steps: + - name: 'Download PR artifact' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'pr' + + - name: "Get PR Number" + if: ${{ steps.dl.outputs.success == 'true' }} + id: get-pr + run: | + unzip pr.zip + echo "NUM=$(<./NR)" >> $GITHUB_OUTPUT + + - name: "Fail 
if PR number was not present" + id: bad-pr + if: ${{ steps.dl.outputs.success != 'true' }} + run: | + echo '::error::A pull request number was not recorded. The pull request that triggered this workflow is likely malicious.' + exit 1 + - name: "Get Invalid Hashes File" + id: hash + run: | + echo "json<> $GITHUB_OUTPUT + - name: "Check PR" + id: check-pr + if: ${{ steps.dl.outputs.success == 'true' }} + uses: carpentries/actions/check-valid-pr@main + with: + pr: ${{ steps.get-pr.outputs.NUM }} + sha: ${{ github.event.workflow_run.head_sha }} + headroom: 3 # if it's within the last three commits, we can keep going, because it's likely rapid-fire + invalid: ${{ fromJSON(steps.hash.outputs.json)[github.repository] }} + fail_on_error: true + + # Create an orphan branch on this repository with two commits + # - the current HEAD of the md-outputs branch + # - the output from running the current HEAD of the pull request through + # the md generator + create-branch: + name: "Create Git Branch" + needs: test-pr + runs-on: ubuntu-latest + if: ${{ needs.test-pr.outputs.is_valid == 'true' }} + env: + NR: ${{ needs.test-pr.outputs.number }} + permissions: + contents: write + steps: + - name: 'Checkout md outputs' + uses: actions/checkout@v3 + with: + ref: md-outputs + path: built + fetch-depth: 1 + + - name: 'Download built markdown' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'built' + + - if: ${{ steps.dl.outputs.success == 'true' }} + run: unzip built.zip + + - name: "Create orphan and push" + if: ${{ steps.dl.outputs.success == 'true' }} + run: | + cd built/ + git config --local user.email "actions@github.com" + git config --local user.name "GitHub Actions" + CURR_HEAD=$(git rev-parse HEAD) + git checkout --orphan md-outputs-PR-${NR} + git add -A + git commit -m "source commit: ${CURR_HEAD}" + ls -A | grep -v '^.git$' | xargs -I _ rm -r '_' + cd .. 
+ unzip -o -d built built.zip + cd built + git add -A + git commit --allow-empty -m "differences for PR #${NR}" + git push -u --force --set-upstream origin md-outputs-PR-${NR} + + # Comment on the Pull Request with a link to the branch and the diff + comment-pr: + name: "Comment on Pull Request" + needs: [test-pr, create-branch] + runs-on: ubuntu-latest + if: ${{ needs.test-pr.outputs.is_valid == 'true' }} + env: + NR: ${{ needs.test-pr.outputs.number }} + permissions: + pull-requests: write + steps: + - name: 'Download comment artifact' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'diff' + + - if: ${{ steps.dl.outputs.success == 'true' }} + run: unzip ${{ github.workspace }}/diff.zip + + - name: "Comment on PR" + id: comment-diff + if: ${{ steps.dl.outputs.success == 'true' }} + uses: carpentries/actions/comment-diff@main + with: + pr: ${{ env.NR }} + path: ${{ github.workspace }}/diff.md + + # Comment if the PR is open and matches the SHA, but the workflow files have + # changed + comment-changed-workflow: + name: "Comment if workflow files have changed" + needs: test-pr + runs-on: ubuntu-latest + if: ${{ always() && needs.test-pr.outputs.is_valid == 'false' }} + env: + NR: ${{ github.event.workflow_run.pull_requests[0].number }} + body: ${{ needs.test-pr.outputs.msg }} + permissions: + pull-requests: write + steps: + - name: 'Check for spoofing' + id: dl + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: 'built' + + - name: 'Alert if spoofed' + id: spoof + if: ${{ steps.dl.outputs.success == 'true' }} + run: | + echo 'body<> $GITHUB_ENV + echo '' >> $GITHUB_ENV + echo '## :x: DANGER :x:' >> $GITHUB_ENV + echo 'This pull request has modified workflows that created output. Close this now.' 
>> $GITHUB_ENV + echo '' >> $GITHUB_ENV + echo 'EOF' >> $GITHUB_ENV + + - name: "Comment on PR" + id: comment-diff + uses: carpentries/actions/comment-diff@main + with: + pr: ${{ env.NR }} + body: ${{ env.body }} + diff --git a/.github/workflows/pr-post-remove-branch.yaml b/.github/workflows/pr-post-remove-branch.yaml new file mode 100755 index 0000000..62c2e98 --- /dev/null +++ b/.github/workflows/pr-post-remove-branch.yaml @@ -0,0 +1,32 @@ +name: "Bot: Remove Temporary PR Branch" + +on: + workflow_run: + workflows: ["Bot: Send Close Pull Request Signal"] + types: + - completed + +jobs: + delete: + name: "Delete branch from Pull Request" + runs-on: ubuntu-latest + if: > + github.event.workflow_run.event == 'pull_request' && + github.event.workflow_run.conclusion == 'success' + permissions: + contents: write + steps: + - name: 'Download artifact' + uses: carpentries/actions/download-workflow-artifact@main + with: + run: ${{ github.event.workflow_run.id }} + name: pr + - name: "Get PR Number" + id: get-pr + run: | + unzip pr.zip + echo "NUM=$(<./NUM)" >> $GITHUB_OUTPUT + - name: 'Remove branch' + uses: carpentries/actions/remove-branch@main + with: + pr: ${{ steps.get-pr.outputs.NUM }} diff --git a/.github/workflows/pr-preflight.yaml b/.github/workflows/pr-preflight.yaml new file mode 100755 index 0000000..d0d7420 --- /dev/null +++ b/.github/workflows/pr-preflight.yaml @@ -0,0 +1,39 @@ +name: "Pull Request Preflight Check" + +on: + pull_request_target: + branches: + ["main"] + types: + ["opened", "synchronize", "reopened"] + +jobs: + test-pr: + name: "Test if pull request is valid" + if: ${{ github.event.action != 'closed' }} + runs-on: ubuntu-latest + outputs: + is_valid: ${{ steps.check-pr.outputs.VALID }} + permissions: + pull-requests: write + steps: + - name: "Get Invalid Hashes File" + id: hash + run: | + echo "json<> $GITHUB_OUTPUT + - name: "Check PR" + id: check-pr + uses: carpentries/actions/check-valid-pr@main + with: + pr: ${{ github.event.number }} + 
invalid: ${{ fromJSON(steps.hash.outputs.json)[github.repository] }} + fail_on_error: true + - name: "Comment result of validation" + id: comment-diff + if: ${{ always() }} + uses: carpentries/actions/comment-diff@main + with: + pr: ${{ github.event.number }} + body: ${{ steps.check-pr.outputs.MSG }} diff --git a/.github/workflows/pr-receive.yaml b/.github/workflows/pr-receive.yaml new file mode 100755 index 0000000..371ef54 --- /dev/null +++ b/.github/workflows/pr-receive.yaml @@ -0,0 +1,131 @@ +name: "Receive Pull Request" + +on: + pull_request: + types: + [opened, synchronize, reopened] + +concurrency: + group: ${{ github.ref }} + cancel-in-progress: true + +jobs: + test-pr: + name: "Record PR number" + if: ${{ github.event.action != 'closed' }} + runs-on: ubuntu-latest + outputs: + is_valid: ${{ steps.check-pr.outputs.VALID }} + steps: + - name: "Record PR number" + id: record + if: ${{ always() }} + run: | + echo ${{ github.event.number }} > ${{ github.workspace }}/NR # 2022-03-02: artifact name fixed to be NR + - name: "Upload PR number" + id: upload + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: pr + path: ${{ github.workspace }}/NR + - name: "Get Invalid Hashes File" + id: hash + run: | + echo "json<> $GITHUB_OUTPUT + - name: "echo output" + run: | + echo "${{ steps.hash.outputs.json }}" + - name: "Check PR" + id: check-pr + uses: carpentries/actions/check-valid-pr@main + with: + pr: ${{ github.event.number }} + invalid: ${{ fromJSON(steps.hash.outputs.json)[github.repository] }} + + build-md-source: + name: "Build markdown source files if valid" + needs: test-pr + runs-on: ubuntu-latest + if: ${{ needs.test-pr.outputs.is_valid == 'true' }} + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RENV_PATHS_ROOT: ~/.local/share/renv/ + CHIVE: ${{ github.workspace }}/site/chive + PR: ${{ github.workspace }}/site/pr + MD: ${{ github.workspace }}/site/built + steps: + - name: "Check Out Main Branch" + uses: actions/checkout@v3 + + - name: 
"Check Out Staging Branch" + uses: actions/checkout@v3 + with: + ref: md-outputs + path: ${{ env.MD }} + + - name: "Set up R" + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - name: "Set up Pandoc" + uses: r-lib/actions/setup-pandoc@v2 + + - name: "Setup Lesson Engine" + uses: carpentries/actions/setup-sandpaper@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Setup Package Cache" + uses: carpentries/actions/setup-lesson-deps@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Validate and Build Markdown" + id: build-site + run: | + sandpaper::package_cache_trigger(TRUE) + sandpaper::validate_lesson(path = '${{ github.workspace }}') + sandpaper:::build_markdown(path = '${{ github.workspace }}', quiet = FALSE) + shell: Rscript {0} + + - name: "Generate Artifacts" + id: generate-artifacts + run: | + sandpaper:::ci_bundle_pr_artifacts( + repo = '${{ github.repository }}', + pr_number = '${{ github.event.number }}', + path_md = '${{ env.MD }}', + path_pr = '${{ env.PR }}', + path_archive = '${{ env.CHIVE }}', + branch = 'md-outputs' + ) + shell: Rscript {0} + + - name: "Upload PR" + uses: actions/upload-artifact@v3 + with: + name: pr + path: ${{ env.PR }} + + - name: "Upload Diff" + uses: actions/upload-artifact@v3 + with: + name: diff + path: ${{ env.CHIVE }} + retention-days: 1 + + - name: "Upload Build" + uses: actions/upload-artifact@v3 + with: + name: built + path: ${{ env.MD }} + retention-days: 1 + + - name: "Teardown" + run: sandpaper::reset_site() + shell: Rscript {0} diff --git a/.github/workflows/sandpaper-main.yaml b/.github/workflows/sandpaper-main.yaml new file mode 100755 index 0000000..e17707a --- /dev/null +++ b/.github/workflows/sandpaper-main.yaml @@ -0,0 +1,61 @@ +name: "01 Build and Deploy Site" + +on: + push: + branches: + - main + - master + schedule: + - cron: '0 0 * * 2' + workflow_dispatch: + inputs: + name: + description: 'Who triggered this build?' 
+ required: true + default: 'Maintainer (via GitHub)' + reset: + description: 'Reset cached markdown files' + required: false + default: false + type: boolean +jobs: + full-build: + name: "Build Full Site" + runs-on: ubuntu-latest + permissions: + checks: write + contents: write + pages: write + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RENV_PATHS_ROOT: ~/.local/share/renv/ + steps: + + - name: "Checkout Lesson" + uses: actions/checkout@v3 + + - name: "Set up R" + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - name: "Set up Pandoc" + uses: r-lib/actions/setup-pandoc@v2 + + - name: "Setup Lesson Engine" + uses: carpentries/actions/setup-sandpaper@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Setup Package Cache" + uses: carpentries/actions/setup-lesson-deps@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: "Deploy Site" + run: | + reset <- "${{ github.event.inputs.reset }}" == "true" + sandpaper::package_cache_trigger(TRUE) + sandpaper:::ci_deploy(reset = reset) + shell: Rscript {0} diff --git a/.github/workflows/sandpaper-version.txt b/.github/workflows/sandpaper-version.txt new file mode 100644 index 0000000..4aa0906 --- /dev/null +++ b/.github/workflows/sandpaper-version.txt @@ -0,0 +1 @@ +0.11.15 diff --git a/.github/workflows/update-cache.yaml b/.github/workflows/update-cache.yaml new file mode 100755 index 0000000..676d742 --- /dev/null +++ b/.github/workflows/update-cache.yaml @@ -0,0 +1,125 @@ +name: "03 Maintain: Update Package Cache" + +on: + workflow_dispatch: + inputs: + name: + description: 'Who triggered this build (enter github username to tag yourself)?' 
+ required: true + default: 'monthly run' + schedule: + # Run every tuesday + - cron: '0 0 * * 2' + +jobs: + preflight: + name: "Preflight Check" + runs-on: ubuntu-latest + outputs: + ok: ${{ steps.check.outputs.ok }} + steps: + - id: check + run: | + if [[ ${{ github.event_name }} == 'workflow_dispatch' ]]; then + echo "ok=true" >> $GITHUB_OUTPUT + echo "Running on request" + # using single brackets here to avoid 08 being interpreted as octal + # https://github.com/carpentries/sandpaper/issues/250 + elif [ `date +%d` -le 7 ]; then + # If the Tuesday lands in the first week of the month, run it + echo "ok=true" >> $GITHUB_OUTPUT + echo "Running on schedule" + else + echo "ok=false" >> $GITHUB_OUTPUT + echo "Not Running Today" + fi + + check_renv: + name: "Check if We Need {renv}" + runs-on: ubuntu-latest + needs: preflight + if: ${{ needs.preflight.outputs.ok == 'true'}} + outputs: + needed: ${{ steps.renv.outputs.exists }} + steps: + - name: "Checkout Lesson" + uses: actions/checkout@v3 + - id: renv + run: | + if [[ -d renv ]]; then + echo "exists=true" >> $GITHUB_OUTPUT + fi + + check_token: + name: "Check SANDPAPER_WORKFLOW token" + runs-on: ubuntu-latest + needs: check_renv + if: ${{ needs.check_renv.outputs.needed == 'true' }} + outputs: + workflow: ${{ steps.validate.outputs.wf }} + repo: ${{ steps.validate.outputs.repo }} + steps: + - name: "validate token" + id: validate + uses: carpentries/actions/check-valid-credentials@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + + update_cache: + name: "Update Package Cache" + needs: check_token + if: ${{ needs.check_token.outputs.repo== 'true' }} + runs-on: ubuntu-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RENV_PATHS_ROOT: ~/.local/share/renv/ + steps: + + - name: "Checkout Lesson" + uses: actions/checkout@v3 + + - name: "Set up R" + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + install-r: false + + - name: "Update {renv} deps and determine if a PR is needed" + id: update 
+ uses: carpentries/actions/update-lockfile@main + with: + cache-version: ${{ secrets.CACHE_VERSION }} + + - name: Create Pull Request + id: cpr + if: ${{ steps.update.outputs.n > 0 }} + uses: carpentries/create-pull-request@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + delete-branch: true + branch: "update/packages" + commit-message: "[actions] update ${{ steps.update.outputs.n }} packages" + title: "Update ${{ steps.update.outputs.n }} packages" + body: | + :robot: This is an automated build + + This will update ${{ steps.update.outputs.n }} packages in your lesson with the following versions: + + ``` + ${{ steps.update.outputs.report }} + ``` + + :stopwatch: In a few minutes, a comment will appear that will show you how the output has changed based on these updates. + + If you want to inspect these changes locally, you can use the following code to check out a new branch: + + ```bash + git fetch origin update/packages + git checkout update/packages + ``` + + - Auto-generated by [create-pull-request][1] on ${{ steps.update.outputs.date }} + + [1]: https://github.com/carpentries/create-pull-request/tree/main + labels: "type: package cache" + draft: false diff --git a/.github/workflows/update-workflows.yaml b/.github/workflows/update-workflows.yaml new file mode 100755 index 0000000..288bcd1 --- /dev/null +++ b/.github/workflows/update-workflows.yaml @@ -0,0 +1,66 @@ +name: "02 Maintain: Update Workflow Files" + +on: + workflow_dispatch: + inputs: + name: + description: 'Who triggered this build (enter github username to tag yourself)?' 
+ required: true + default: 'weekly run' + clean: + description: 'Workflow files/file extensions to clean (no wildcards, enter "" for none)' + required: false + default: '.yaml' + schedule: + # Run every Tuesday + - cron: '0 0 * * 2' + +jobs: + check_token: + name: "Check SANDPAPER_WORKFLOW token" + runs-on: ubuntu-latest + outputs: + workflow: ${{ steps.validate.outputs.wf }} + repo: ${{ steps.validate.outputs.repo }} + steps: + - name: "validate token" + id: validate + uses: carpentries/actions/check-valid-credentials@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + + update_workflow: + name: "Update Workflow" + runs-on: ubuntu-latest + needs: check_token + if: ${{ needs.check_token.outputs.workflow == 'true' }} + steps: + - name: "Checkout Repository" + uses: actions/checkout@v3 + + - name: Update Workflows + id: update + uses: carpentries/actions/update-workflows@main + with: + clean: ${{ github.event.inputs.clean }} + + - name: Create Pull Request + id: cpr + if: "${{ steps.update.outputs.new }}" + uses: carpentries/create-pull-request@main + with: + token: ${{ secrets.SANDPAPER_WORKFLOW }} + delete-branch: true + branch: "update/workflows" + commit-message: "[actions] update sandpaper workflow to version ${{ steps.update.outputs.new }}" + title: "Update Workflows to Version ${{ steps.update.outputs.new }}" + body: | + :robot: This is an automated build + + Update Workflows from sandpaper version ${{ steps.update.outputs.old }} -> ${{ steps.update.outputs.new }} + + - Auto-generated by [create-pull-request][1] on ${{ steps.update.outputs.date }} + + [1]: https://github.com/carpentries/create-pull-request/tree/main + labels: "type: template and tools" + draft: false diff --git a/.github/workflows/workbench-beta-phase.yml b/.github/workflows/workbench-beta-phase.yml new file mode 100644 index 0000000..2faa25d --- /dev/null +++ b/.github/workflows/workbench-beta-phase.yml @@ -0,0 +1,60 @@ +name: "Deploy to AWS" + +on: + workflow_run: + workflows: ["01 
Build and Deploy Site"] + types: + - completed + workflow_dispatch: + +jobs: + preflight: + name: "Preflight Check" + runs-on: ubuntu-latest + outputs: + ok: ${{ steps.check.outputs.ok }} + folder: ${{ steps.check.outputs.folder }} + steps: + - id: check # emits ok/folder only when all AWS deployment secrets are present + run: | + if [[ -z "${{ secrets.DISTRIBUTION }}" || -z "${{ secrets.AWS_ACCESS_KEY_ID }}" || -z "${{ secrets.AWS_SECRET_ACCESS_KEY }}" ]]; then + echo ":information_source: No site configured" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo 'To deploy the preview on AWS, you need the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `DISTRIBUTION` secrets set up' >> $GITHUB_STEP_SUMMARY + else # secrets are configured: publish step outputs via the GITHUB_OUTPUT file (the set-output workflow command is deprecated) + echo "folder=$(sed -E 's^.+/(.+)^\1^' <<< ${{ github.repository }})" >> $GITHUB_OUTPUT + echo "ok=true" >> $GITHUB_OUTPUT + fi + + full-build: + name: "Deploy to AWS" + needs: [preflight] + if: ${{ needs.preflight.outputs.ok == 'true' }} # job outputs are strings, so compare explicitly + runs-on: ubuntu-latest + steps: + + - name: "Checkout site folder" + uses: actions/checkout@v3 + with: + ref: 'gh-pages' + path: 'source' + + - name: "Deploy to Bucket" + uses: jakejarvis/s3-sync-action@v0.5.1 + with: + args: --acl public-read --follow-symlinks --delete --exclude '.git/*' + env: + AWS_S3_BUCKET: preview.carpentries.org + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + SOURCE_DIR: 'source' + DEST_DIR: ${{ needs.preflight.outputs.folder }} + + - name: "Invalidate CloudFront" + uses: chetan/invalidate-cloudfront-action@master + env: + PATHS: /* + AWS_REGION: 'us-east-1' + DISTRIBUTION: ${{ secrets.DISTRIBUTION }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b8ab706 --- /dev/null +++ b/.gitignore @@ -0,0 +1,55 @@ +# sandpaper files +episodes/*html +site/* +!site/README.md + +# History files +.Rhistory +.Rapp.history +# Session Data files +.RData +# 
User-specific files +.Ruserdata +# Example code in package build process +*-Ex.R +# Output files from R CMD build +/*.tar.gz +# Output files from R CMD check +/*.Rcheck/ +# RStudio files +.Rproj.user/ +# produced vignettes +vignettes/*.html +vignettes/*.pdf +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth +# knitr and R markdown default cache directories +*_cache/ +/cache/ +# Temporary files created by R markdown +*.utf8.md +*.knit.md +# R Environment Variables +.Renviron +# pkgdown site +docs/ +# translation temp files +po/*~ +# renv detritus +renv/sandbox/ +*.pyc +*~ +.DS_Store +.ipynb_checkpoints +.sass-cache +.jekyll-cache/ +.jekyll-metadata +__pycache__ +_site +.Rproj.user +.bundle/ +.vendor/ +vendor/ +.docker-vendor/ +Gemfile.lock +.*history diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..f19b804 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,13 @@ +--- +title: "Contributor Code of Conduct" +--- + +As contributors and maintainers of this project, +we pledge to follow [The Carpentries Code of Conduct][coc]. + +Instances of abusive, harassing, or otherwise unacceptable behavior +may be reported by following our [reporting guidelines][coc-reporting]. + + +[coc-reporting]: https://docs.carpentries.org/topic_folders/policies/incident-reporting.html +[coc]: https://docs.carpentries.org/topic_folders/policies/code-of-conduct.html diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..ec44704 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,121 @@ +## Contributing + +[The Carpentries][cp-site] ([Software Carpentry][swc-site], [Data +Carpentry][dc-site], and [Library Carpentry][lc-site]) are open source +projects, and we welcome contributions of all kinds: new lessons, fixes to +existing material, bug reports, and reviews of proposed changes are all +welcome. 
+ +### Contributor Agreement + +By contributing, you agree that we may redistribute your work under [our +license](LICENSE.md). In exchange, we will address your issues and/or assess +your change proposal as promptly as we can, and help you become a member of our +community. Everyone involved in [The Carpentries][cp-site] agrees to abide by +our [code of conduct](CODE_OF_CONDUCT.md). + +### How to Contribute + +The easiest way to get started is to file an issue to tell us about a spelling +mistake, some awkward wording, or a factual error. This is a good way to +introduce yourself and to meet some of our community members. + +1. If you do not have a [GitHub][github] account, you can [send us comments by + email][contact]. However, we will be able to respond more quickly if you use + one of the other methods described below. + +2. If you have a [GitHub][github] account, or are willing to [create + one][github-join], but do not know how to use Git, you can report problems + or suggest improvements by [creating an issue][issues]. This allows us to + assign the item to someone and to respond to it in a threaded discussion. + +3. If you are comfortable with Git, and would like to add or change material, + you can submit a pull request (PR). Instructions for doing this are + [included below](#using-github). + +Note: if you want to build the website locally, please refer to [The Workbench +documentation][template-doc]. + +### Where to Contribute + +1. If you wish to change this lesson, add issues and pull requests here. +2. If you wish to change the template used for workshop websites, please refer + to [The Workbench documentation][template-doc]. + + +### What to Contribute + +There are many ways to contribute, from writing new exercises and improving +existing ones to updating or filling in the documentation and submitting [bug +reports][issues] about things that do not work, are not clear, or are missing. 
+If you are looking for ideas, please see [the list of issues for this +repository][repo], or the issues for [Data Carpentry][dc-issues], [Library +Carpentry][lc-issues], and [Software Carpentry][swc-issues] projects. + +Comments on issues and reviews of pull requests are just as welcome: we are +smarter together than we are on our own. **Reviews from novices and newcomers +are particularly valuable**: it's easy for people who have been using these +lessons for a while to forget how impenetrable some of this material can be, so +fresh eyes are always welcome. + +### What *Not* to Contribute + +Our lessons already contain more material than we can cover in a typical +workshop, so we are usually *not* looking for more concepts or tools to add to +them. As a rule, if you want to introduce a new idea, you must (a) estimate how +long it will take to teach and (b) explain what you would take out to make room +for it. The first encourages contributors to be honest about requirements; the +second, to think hard about priorities. + +We are also not looking for exercises or other material that only run on one +platform. Our workshops typically contain a mixture of Windows, macOS, and +Linux users; in order to be usable, our lessons must run equally well on all +three. + +### Using GitHub + +If you choose to contribute via GitHub, you may want to look at [How to +Contribute to an Open Source Project on GitHub][how-contribute]. In brief, we +use [GitHub flow][github-flow] to manage changes: + +1. Create a new branch in your desktop copy of this repository for each + significant change. +2. Commit the change in that branch. +3. Push that branch to your fork of this repository on GitHub. +4. Submit a pull request from that branch to the [upstream repository][repo]. +5. If you receive feedback, make changes on your desktop and push to your + branch on GitHub: the pull request will update automatically. + +NB: The published copy of the lesson is usually in the `main` branch. 
+ +Each lesson has a team of maintainers who review issues and pull requests or +encourage others to do so. The maintainers are community volunteers, and have +final say over what gets merged into the lesson. + +### Other Resources + +The Carpentries is a global organisation with volunteers and learners all over +the world. We share values of inclusivity and a passion for sharing knowledge, +teaching and learning. There are several ways to connect with The Carpentries +community listed at <https://carpentries.org/connect/> including via social +media, slack, newsletters, and email lists. You can also [reach us by +email][contact]. + +[repo]: https://example.com/FIXME +[contact]: mailto:team@carpentries.org +[cp-site]: https://carpentries.org/ +[dc-issues]: https://github.com/issues?q=user%3Adatacarpentry +[dc-lessons]: https://datacarpentry.org/lessons/ +[dc-site]: https://datacarpentry.org/ +[discuss-list]: https://lists.software-carpentry.org/listinfo/discuss +[github]: https://github.com +[github-flow]: https://guides.github.com/introduction/flow/ +[github-join]: https://github.com/join +[how-contribute]: https://egghead.io/series/how-to-contribute-to-an-open-source-project-on-github +[issues]: https://carpentries.org/help-wanted-issues/ +[lc-issues]: https://github.com/issues?q=user%3ALibraryCarpentry +[swc-issues]: https://github.com/issues?q=user%3Aswcarpentry +[swc-lessons]: https://software-carpentry.org/lessons/ +[swc-site]: https://software-carpentry.org/ +[lc-site]: https://librarycarpentry.org/ +[template-doc]: https://carpentries.github.io/workbench/ diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..7632871 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,79 @@ +--- +title: "Licenses" +--- + +## Instructional Material + +All Carpentries (Software Carpentry, Data Carpentry, and Library Carpentry) +instructional material is made available under the [Creative Commons +Attribution license][cc-by-human]. 
The following is a human-readable summary of +(and not a substitute for) the [full legal text of the CC BY 4.0 +license][cc-by-legal]. + +You are free: + +- to **Share**---copy and redistribute the material in any medium or format +- to **Adapt**---remix, transform, and build upon the material + +for any purpose, even commercially. + +The licensor cannot revoke these freedoms as long as you follow the license +terms. + +Under the following terms: + +- **Attribution**---You must give appropriate credit (mentioning that your work + is derived from work that is Copyright (c) The Carpentries and, where + practical, linking to <https://carpentries.org/>), provide a [link to the + license][cc-by-human], and indicate if changes were made. You may do so in + any reasonable manner, but not in any way that suggests the licensor endorses + you or your use. + +- **No additional restrictions**---You may not apply legal terms or + technological measures that legally restrict others from doing anything the + license permits. With the understanding that: + +Notices: + +* You do not have to comply with the license for elements of the material in + the public domain or where your use is permitted by an applicable exception + or limitation. +* No warranties are given. The license may not give you all of the permissions + necessary for your intended use. For example, other rights such as publicity, + privacy, or moral rights may limit how you use the material. + +## Software + +Except where otherwise noted, the example programs and other software provided +by The Carpentries are made available under the [OSI][osi]-approved [MIT +license][mit-license]. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +## Trademark + +"The Carpentries", "Software Carpentry", "Data Carpentry", and "Library +Carpentry" and their respective logos are registered trademarks of [Community +Initiatives][ci]. 
+ +[cc-by-human]: https://creativecommons.org/licenses/by/4.0/ +[cc-by-legal]: https://creativecommons.org/licenses/by/4.0/legalcode +[mit-license]: https://opensource.org/licenses/mit-license.html +[ci]: https://communityin.org/ +[osi]: https://opensource.org diff --git a/README.md b/README.md index 8970afd..e7dd95a 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,14 @@ -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3269869.svg)](https://doi.org/10.5281/zenodo.3269869) -[![Create a Slack Account with us](https://img.shields.io/badge/Create_Slack_Account-The_Carpentries-071159.svg)](https://swc-slack-invite.herokuapp.com/) -[![Slack Status](https://img.shields.io/badge/Slack_Channel-dc--ecology--data--org-E01563.svg)](https://swcarpentry.slack.com/messages/C9WJAN3CH) +> **ATTENTION** This is an experimental test of [The Carpentries Workbench](https://carpentries.github.io/workbench) lesson infrastructure. +> It was automatically converted from the source lesson via [the lesson transition script](https://github.com/carpentries/lesson-transition/). +> +> If anything seems off, please contact Zhian Kamvar [zkamvar@carpentries.org](mailto:zkamvar@carpentries.org) +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3269869.svg)](https://doi.org/10.5281/zenodo.3269869) +[![Create a Slack Account with us](https://img.shields.io/badge/Create_Slack_Account-The_Carpentries-071159.svg)](https://swc-slack-invite.herokuapp.com/) +[![Slack Status](https://img.shields.io/badge/Slack_Channel-dc--ecology--data--org-E01563.svg)](https://swcarpentry.slack.com/messages/C9WJAN3CH) -Data Carpentry Core Curriculum -============= +# Data Carpentry Core Curriculum This repository contains core material for the 2-day data carpentry workshop. Please see our [contribution guidelines](CONTRIBUTING.md) before contributing updates, bug fixes, or other corrections. 
- + + diff --git a/_extras/guide.md b/_extras/guide.md deleted file mode 100644 index 43b5378..0000000 --- a/_extras/guide.md +++ /dev/null @@ -1,114 +0,0 @@ ---- -layout: page -title: Instructor Notes ---- - -## Instructor notes - -## Lesson motivation and learning objectives - -The purpose of this lesson is *not* to teach how to do data analysis in spreadsheets, -but to teach good data organization and how to do some data cleaning and -quality control in a spreadsheet program. - -## Lesson design - -#### [Introduction](../00-intro/) - -* Introduce that we're teaching data organization, and that we're using -spreadsheets, because most people do data entry in spreadsheets or -have data in spreadsheets. -* Emphasize that we are teaching good practice in data organization and that -this is the foundation of their research practice. Without organized and clean -data, it will be difficult for them to apply the things we're teaching in the -rest of the workshop to their data. -* Much of their lives as a researcher will be spent on this 'data wrangling' stage, but -some of it can be prevented with good strategies for data collection up front. -* Tell that we're not teaching data analysis or plotting in spreadsheets, because it's -very manual and also not reproducible. That's why we're teaching SQL, R, Python! -* Now let's talk about spreadsheets, and when we say spreadsheets, we mean any program that -does spreadsheets like Excel, LibreOffice, OpenOffice. Most learners are probably using Excel. -* Ask the audience any things they've accidentally done in spreadsheets. Talk about an example of your own, like that you accidentally sorted only a single column and not the rest -of the data in the spreadsheet. What are the pain points!? 
-* As people answer highlight some of these issues with spreadsheets - -#### [Formatting data](../01-format-data/) - -* Go through the point about keeping track of your steps and keeping raw data raw -* Go through the cardinal rule of spreadsheets about columns, rows and cells -* Hand them a messy data file and have them pair up and work together to clean up the data. -*Give them 15 minutes to do this.* -* Ask for what people did to clean the data. As they bring up different points you can -refer to them in the 02-common-mistakes.md file, or expand a bit on the point they brought up. -If you are just teaching the lesson, it would be good to familiarize yourself with -the set of mistakes in 02-common-mistakes. All these mistakes are present in the messy -dataset. -* If you get a response where they've fixed the date, you can pause and go to the -03-dates-as-data.md lesson. Or you can say you'll come back to dates at the end. -There's an exercise in that file about how to change the -date into three columns using Excel's built in MONTH, DAY, YEAR functions. Have them -run through that exercise. - -#### [Common formatting problems](../02-common-mistakes/) - -* **Don't go through this chapter** except to refer to as responses to the exercise in -the previous chapter. - -#### [Dates as data](../03-dates-as-data/) - -* Do the exercise and make the point about dates either in response to a learner bringing -up date as an issue during the responses, or at the end of the response time. - -#### [Quality control](../04-quality-control/) -*This lesson is optional* - -The challenge with this lesson is that the instructor's version of the spreadsheet software is going to look different than about half the room's. It makes -it challenging to show where you can find menu options and navigate through. - -Instead discuss the concepts of quality control, and how things like sorting can help you find outliers in your data. 
- -#### [Exporting data](../05-exporting-data/) - -* Have the students export their cleaned data as CSV. Reiterate again the need for -data in this format for the other tools we'll be using. - -#### Concluding points - -* Now your data is organized so that a computer can read and understand it. This -let's you use the full power of the computer for your analyses as we'll see in the -rest of the workshop. -* While your data is now neatly organized, it still might have errors or missing data -or other problems. It's like you put all your data in the right drawers, but the -drawers might still be messy. The next lesson is going to teach you OpenRefine which -is great for data cleaning and for some of the quality control that we touched on -in this lesson. It also has the advantage that it automatically keeps track of the -steps you take. - -## Technical tips and tricks - -Provide information on setting up your environment for learners to view your -live coding (increasing text size, changing text color, etc), as well as -general recommendations for working with coding tools to best suit the -learning environment. - -## Common problems - -#### Excel looks and acts different on different operating systems - -The main challenge with this lesson is that Excel looks very different and how you -do things is even different between Mac and PC, and between different versions of -Excel. So, the presenter's environment will only be the same as some of the learners. - -We need better notes and screenshots of how things work on both Mac and PC. But we -likely won't be able to cover all the different versions of Excel. - -If you have a helper who has experience with the other OS than you, it would be good -to prep them to help with this lesson and tell how people to do things in the other OS. - -#### People are not interactive or responsive on the Exercise - -This lesson depends on people working on the exercise and responding with things -that are fixed. 
If your audience is reluctant to participate, start out with -some things on your own, or ask a helper for their answers. This generally gets -even a reluctant audience started. - diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..edc5093 --- /dev/null +++ b/config.yaml @@ -0,0 +1,87 @@ +#------------------------------------------------------------ +# Values for this lesson. +#------------------------------------------------------------ + +# Which carpentry is this (swc, dc, lc, or cp)? +# swc: Software Carpentry +# dc: Data Carpentry +# lc: Library Carpentry +# cp: Carpentries (to use for instructor training for instance) +# incubator: The Carpentries Incubator +carpentry: 'dc' + +# Overall title for pages. +title: 'Data Organization in Spreadsheets for Ecologists' + +# Date the lesson was created (YYYY-MM-DD, this is empty by default) +created: + +# Comma-separated list of keywords for the lesson +keywords: 'software, data, lesson, The Carpentries' + +# Life cycle stage of the lesson +# possible values: pre-alpha, alpha, beta, stable +life_cycle: 'stable' + +# License of the lesson materials (recommended CC-BY 4.0) +license: 'CC-BY 4.0' + +# Link to the source repository for this lesson +source: 'https://github.com/fishtree-attempt/spreadsheet-ecology-lesson/' + +# Default branch of your lesson +branch: 'main' + +# Who to contact if there are any issues +contact: 'team@carpentries.org' + +# Navigation ------------------------------------------------ +# +# Use the following menu items to specify the order of +# individual pages in each dropdown section. Leave blank to +# include all pages in the folder. 
+# +# Example ------------- +# +# episodes: +# - introduction.md +# - first-steps.md +# +# learners: +# - setup.md +# +# instructors: +# - instructor-notes.md +# +# profiles: +# - one-learner.md +# - another-learner.md + +# Order of episodes in your lesson +episodes: +- 00-intro.md +- 01-format-data.md +- 02-common-mistakes.md +- 03-dates-as-data.md +- 04-quality-control.md +- 05-exporting-data.md + +# Information for Learners +learners: + +# Information for Instructors +instructors: + +# Learner Profiles +profiles: + +# Customisation --------------------------------------------- +# +# This space below is where custom yaml items (e.g. pinning +# sandpaper and varnish versions) should live + + +url: https://preview.carpentries.org/spreadsheet-ecology-lesson +analytics: carpentries +lang: en +workbench-beta: 'true' diff --git a/episodes/00-intro.md b/episodes/00-intro.md index 6948bc4..8b78a70 100644 --- a/episodes/00-intro.md +++ b/episodes/00-intro.md @@ -1,15 +1,21 @@ --- -title: "Introduction" +title: Introduction teaching: 15 exercises: 3 -questions: -- "What are basic principles for using spreadsheets for good data organization?" -objectives: -- "Describe best practices for organizing data so computers can make the best use of data sets." -keypoints: -- "Good data organization is the foundation of any research project." --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Describe best practices for organizing data so computers can make the best use of data sets. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What are basic principles for using spreadsheets for good data organization? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + Good data organization is the foundation of your research project. Most researchers have data or do data entry in spreadsheets. Spreadsheet programs are very useful graphical @@ -19,7 +25,8 @@ quality control functions. 
### Spreadsheet outline After this lesson, you will be able to: -- Implement best practices in data table formatting + +- Implement best practices in data table formatting - Identify and address common formatting mistakes - Understand approaches for handling dates in spreadsheets - Utilize basic quality control features and data manipulation practices @@ -28,13 +35,13 @@ After this lesson, you will be able to: *Overall good data practices* Spreadsheets are good for data entry. Therefore we have a lot of data -in spreadsheets. +in spreadsheets. Much of your time as a researcher will be spent in this 'data wrangling' stage. It's not the most fun, but it's necessary. We'll teach you how to think -about data organization and some practices for more effective data wrangling. Not only -will this save time and effort, it also paves the way for more equitable and inclusive -science. Making our work more accessible provides wider avenues for collaboration -and improvement! +about data organization and some practices for more effective data wrangling. Not only +will this save time and effort, it also paves the way for more equitable and inclusive +science. Making our work more accessible provides wider avenues for collaboration +and improvement! ### What this lesson will not teach you @@ -45,7 +52,7 @@ and improvement! If you're looking to do this, a good reference is [Head First Excel](https://www.amazon.com/Head-First-Excel-learners-spreadsheets/dp/0596807694/), published by O'Reilly. ---- +*** ### Why aren't we teaching data analysis in spreadsheets @@ -67,11 +74,16 @@ Free spreadsheet programs that can also be used are LibreOffice Calc, and even G Commands may differ a bit between programs, but the general idea is the same. -> ## Exercise -> - How many people have used spreadsheets in their research? -> - How many people have accidentally done something that made them -> frustrated or sad? 
-{: .callout} +::::::::::::::::::::::::::::::::::::::::: callout + +## Exercise + +- How many people have used spreadsheets in their research? +- How many people have accidentally done something that made them + frustrated or sad? + + +:::::::::::::::::::::::::::::::::::::::::::::::::: Spreadsheets encompass a lot of the things we need to be able to do as researchers. We can use them for: @@ -84,7 +96,6 @@ to be able to do as researchers. We can use them for: We do a lot of different operations in spreadsheets. What kind of operations do you do in spreadsheets? Which ones do you think spreadsheets are good for? - ## Problems with Spreadsheets Spreadsheets are good for data entry, but in reality we tend to @@ -93,7 +104,7 @@ to create data tables for publications, to generate summary statistics, and make figures. Generating tables for publications in a spreadsheet is not -optimal - often, when formatting a data table for publication, we’re +optimal - often, when formatting a data table for publication, we're reporting key summary statistics in a way that is not really meant to be read as data, and often involves special formatting (merging cells, creating borders, making it pretty). Cutting and pasting from a spreadsheet @@ -101,25 +112,25 @@ to a document software (like Word) can have unpredictable results. We advise you to create tables within these document software using the document's own table editing software. -The latter two applications, generating statistics and figures, should -be used with caution: because of the graphical, drag and drop nature of -spreadsheet programs, it can be very difficult, if not impossible, to -replicate your steps (much less retrace anyone else's), particularly if your -stats or figures require you to do more complex calculations. Furthermore, -in doing calculations in a spreadsheet, it’s easy to accidentally apply a -slightly different formula to multiple adjacent cells. 
When using a -command-line based statistics program like R or SAS, it’s practically -impossible to apply a calculation to one observation in your -dataset but not another unless you’re doing it on purpose. +The latter two applications, generating statistics and figures, should +be used with caution: because of the graphical, drag and drop nature of +spreadsheet programs, it can be very difficult, if not impossible, to +replicate your steps (much less retrace anyone else's), particularly if your +stats or figures require you to do more complex calculations. Furthermore, +in doing calculations in a spreadsheet, it's easy to accidentally apply a +slightly different formula to multiple adjacent cells. When using a +command-line based statistics program like R or SAS, it's practically +impossible to apply a calculation to one observation in your +dataset but not another unless you're doing it on purpose. ### Using Spreadsheets for Data Entry and Cleaning -However, there are circumstances where you might want to use a spreadsheet -program to produce “quick and dirty” calculations or figures, and data +However, there are circumstances where you might want to use a spreadsheet +program to produce "quick and dirty" calculations or figures, and data cleaning will help you use some of these features. Data cleaning also -puts your data in a better format prior to importation into a -statistical analysis program. We will show you how to use some features of -spreadsheet programs to check your data quality along the way and produce +puts your data in a better format prior to importation into a +statistical analysis program. We will show you how to use some features of +spreadsheet programs to check your data quality along the way and produce preliminary summary statistics. In this lesson, we will assume that you are most likely using Excel as @@ -129,9 +140,16 @@ to be the program most used by biologists and ecologists. In this lesson we're going to talk about: -1. 
[Formatting data tables in spreadsheets](../01-format-data/) -2. [Formatting problems](../02-common-mistakes/) -3. [Dates as data](../03-dates-as-data/) -4. [Quality control](../04-quality-control/) -5. [Exporting data](../05-exporting-data/) +1. [Formatting data tables in spreadsheets](01-format-data.md) +2. [Formatting problems](02-common-mistakes.md) +3. [Dates as data](03-dates-as-data.md) +4. [Quality control](04-quality-control.md) +5. [Exporting data](05-exporting-data.md) + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Good data organization is the foundation of any research project. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + diff --git a/episodes/01-format-data.md b/episodes/01-format-data.md index f5ba109..606cf44 100644 --- a/episodes/01-format-data.md +++ b/episodes/01-format-data.md @@ -1,18 +1,22 @@ --- -title: "Formatting data tables in Spreadsheets" +title: Formatting data tables in Spreadsheets teaching: 15 exercises: 20 -questions: -- "How do we format data in spreadsheets for effective data use?" -objectives: -- "Describe best practices for data entry and formatting in spreadsheets." -- "Apply best practices to arrange variables and observations in a spreadsheet." -keypoints: -- "Never modify your raw data. Always make a copy before making any changes." -- "Keep track of all of the steps you take to clean your data in a plain text file." -- "Organize your data according to tidy data principles." --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Describe best practices for data entry and formatting in spreadsheets. +- Apply best practices to arrange variables and observations in a spreadsheet. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How do we format data in spreadsheets for effective data use? 
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + The most common mistake made is treating spreadsheet programs like lab notebooks, that is, relying on context, notes in the margin, spatial layout of data and fields to convey information. As humans, we @@ -21,12 +25,12 @@ unless we explain to the computer what every single thing means (and that can be hard!), it will not be able to see how our data fits together. -Using the power of computers, we can manage and analyze data in much more +Using the power of computers, we can manage and analyze data in much more effective and faster ways, but to use that power, we have to set up -our data for the computer to be able to understand it (and computers are very +our data for the computer to be able to understand it (and computers are very literal). -This is why it’s extremely important to set up well-formatted +This is why it's extremely important to set up well-formatted tables from the outset - before you even start entering data from your very first preliminary experiment. Data organization is the foundation of your research project. It can make it easier or harder @@ -37,13 +41,17 @@ but some of these choices can limit your ability to work with the data in other have the you-of-6-months-from-now or your collaborator work with the data. -> ## Note -> -> The best layouts/formats (as well as software and -> interfaces) for data entry and data analysis might be -> different. It is important to take this into account, and ideally -> automate the conversion from one to another. -{: .callout} +::::::::::::::::::::::::::::::::::::::::: callout + +## Note + +The best layouts/formats (as well as software and +interfaces) for data entry and data analysis might be +different. It is important to take this into account, and ideally +automate the conversion from one to another. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: ### Keeping track of your analyses @@ -53,40 +61,42 @@ you started with. 
In order to be able to reproduce your analyses or figure out what you did when Reviewer #3 asks for a different analysis, you should - create a new file with your cleaned or analyzed data. Don't modify -the original dataset, or you will never know where you started! -- keep track of the steps you took in your clean up or analysis. You should track -these steps as you would any step in an experiment. We recommend that you -do this in a plain text file stored in the same folder as the data file. + the original dataset, or you will never know where you started! +- keep track of the steps you took in your clean up or analysis. You should track + these steps as you would any step in an experiment. We recommend that you + do this in a plain text file stored in the same folder as the data file. This might be an example of a spreadsheet setup: -![spreadsheet setup](../fig/spreadsheet-setup-updated.png) +![](fig/spreadsheet-setup-updated.png){alt='spreadsheet setup'} Put these principles in to practice today during your Exercises. -> ## Note -> -> This is out of scope for this lesson, but for information on how to -> maintain version control over your data, look at our lesson on -> ['Git'](http://swcarpentry.github.io/git-novice/). -{: .callout} +::::::::::::::::::::::::::::::::::::::::: callout +## Note + +This is out of scope for this lesson, but for information on how to +maintain version control over your data, look at our lesson on +['Git'](https://swcarpentry.github.io/git-novice/). -### Structuring data in spreadsheets +:::::::::::::::::::::::::::::::::::::::::::::::::: + +### Structuring data in spreadsheets The cardinal rule of using spreadsheet programs for data is to keep it "tidy": 1. Put all your variables in columns - the thing you're measuring, - like 'weight' or 'temperature'. + like 'weight' or 'temperature'. 2. Put each observation in its own row. 3. Don't combine multiple pieces of information in one - cell. 
Sometimes it just seems like one thing, but think if that's - the only way you'll want to be able to use or sort that data. + cell. Sometimes it just seems like one thing, but think if that's + the only way you'll want to be able to use or sort that data. 4. Leave the raw data raw - don't change it! 5. Export the cleaned data to a text-based format like CSV (comma-separated values) format. This - ensures that anyone can use the data, and is required by - most data repositories. + ensures that anyone can use the data, and is required by + most data repositories. For instance, we have data from a survey of small mammals in a desert ecosystem. Different people have gone to the field and entered data into a spreadsheet. They keep track of things like species, plot, @@ -94,12 +104,12 @@ weight, sex and date collected. If they were to keep track of the data like this: -![multiple-info example](../fig/multiple-info.png) +![](fig/multiple-info.png){alt='multiple-info example'} -the problem is that species and sex are in the same field. So, if they wanted to -look at all of one species or look at different weight distributions by sex, -it would be hard to do this using this data setup. If instead we put sex and species -in different columns, you can see that it would be much easier. +the problem is that species and sex are in the same field. So, if they wanted to +look at all of one species or look at different weight distributions by sex, +it would be hard to do this using this data setup. If instead we put sex and species +in different columns, you can see that it would be much easier. ### Columns for variables and rows for observations @@ -108,63 +118,86 @@ variables, rows = observations, cells = data (values). 
So, instead we should have: -![single-info example](../fig/single-info.png) - -> ## Discussion -> If not already discussed, introduce the dataset that will be used in this -> lesson, and in the other ecology lessons, the [Portal Project Teaching Dataset](http://www.datacarpentry.org/ecology-workshop/data/). -> -> The data used in the ecology lessons are observations of a small mammal community in southern Arizona. This is part of a project studying the effects of rodents and ants on the plant community that has been running for almost 40 years. The rodents are sampled on a series of 24 plots, with different experimental manipulations controlling which rodents are allowed to access which plots. -> -> This is a real dataset that has been used in over 100 publications. We’ve simplified it just a little bit for the workshop, but you can download the full dataset and work with it using exactly the same tools we’ll learn about today. -{: .discussion} - - -> ## Exercise -> -> We're going to take a messy version of the survey data and describe how we would clean it up. -> -> 1. Download the data by clicking [here](https://ndownloader.figshare.com/files/2252083) to get it from FigShare. -> 2. Open up the data in a spreadsheet program. -> 3. You can see that there are two tabs. Two field assistants conducted the surveys, one -in 2013 and one in 2014, and they both kept track of the data in their own way in tabs `2013` and `2014` of the dataset, ->respectively. Now -you're the person in charge of this project and you want to be able to -start analyzing the data. -> 4. With the person next to you, identify what is wrong with this spreadsheet. Also discuss the steps you would need to take to clean up the `2013` and `2014` tabs, and to put them all together in one spreadsheet. -> -> **Important** Do not forget our first piece of advice: to -> create a new file (or tab) for the cleaned data, never -> modify your original (raw) data. 
-> -> After you go through this exercise, we'll discuss as a group what was wrong -> with this data and how you would fix it. -> -> > ## Solution -> > - Take about 10 minutes to work on this exercise. -> > - All the mistakes in [02-common-mistakes](../02-common-mistakes) are present in the messy dataset. If the -> > exercise is done during a workshop, ask people what they saw as wrong with -> > the data. As they bring up different points, you can refer to [02-common-mistakes](../02-common-mistakes) -> > or expand a bit on the point they brought up. -> > - Note that there is a problem with dates in table 'plot 3' in `2014` tab. The field assistant who collected the data -> > for year 2014 initially forgot to include their data for 'plot 3'. They came back in 2015 to include the missing data and -> > entered the dates for 'plot 3' in the dataset without the year. Excel automatically filled in the missing year as the -> > current year (i.e. 2015) - introducing an error in the data without the field assistant realising. If you get a response -> > from the participants that they've spotted and fixed the problem with date, you can say you'll come back to dates again -> > towards the end of lesson in episode [03-dates-as-data](../03-dates-as-data). If participants have not spotted the -> > problem with dates in 'plot 3' table, that's fine as you will address peculiarities of working with dates in -> > spreadsheets in episode [03-dates-as-data](../03-dates-as-data). -> {: .solution} -{: .challenge} +![](fig/single-info.png){alt='single-info example'} + +:::::::::::::::::::::::::::::::::::::: discussion + +## Discussion + +If not already discussed, introduce the dataset that will be used in this +lesson, and in the other ecology lessons, the [Portal Project Teaching Dataset](https://www.datacarpentry.org/ecology-workshop/data/). + +The data used in the ecology lessons are observations of a small mammal community in southern Arizona. 
This is part of a project studying the effects of rodents and ants on the plant community that has been running for almost 40 years. The rodents are sampled on a series of 24 plots, with different experimental manipulations controlling which rodents are allowed to access which plots. + +This is a real dataset that has been used in over 100 publications. We've simplified it just a little bit for the workshop, but you can download the full dataset and work with it using exactly the same tools we'll learn about today. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::: challenge + +## Exercise + +We're going to take a messy version of the survey data and describe how we would clean it up. + +1. Download the data by clicking [here](https://ndownloader.figshare.com/files/2252083) to get it from FigShare. +2. Open up the data in a spreadsheet program. +3. You can see that there are two tabs. Two field assistants conducted the surveys, one + in 2013 and one in 2014, and they both kept track of the data in their own way in tabs `2013` and `2014` of the dataset, + respectively. Now + you're the person in charge of this project and you want to be able to + start analyzing the data. +4. With the person next to you, identify what is wrong with this spreadsheet. Also discuss the steps you would need to take to clean up the `2013` and `2014` tabs, and to put them all together in one spreadsheet. + +**Important** Do not forget our first piece of advice: to +create a new file (or tab) for the cleaned data, never +modify your original (raw) data. + +After you go through this exercise, we'll discuss as a group what was wrong +with this data and how you would fix it. + +::::::::::::::: solution + +## Solution + +- Take about 10 minutes to work on this exercise. +- All the mistakes in [02-common-mistakes](02-common-mistakes.md) are present in the messy dataset. 
If the + exercise is done during a workshop, ask people what they saw as wrong with + the data. As they bring up different points, you can refer to [02-common-mistakes](02-common-mistakes.md) + or expand a bit on the point they brought up. +- Note that there is a problem with dates in table 'plot 3' in the `2014` tab. The field assistant who collected the data + for year 2014 initially forgot to include their data for 'plot 3'. They came back in 2015 to include the missing data and + entered the dates for 'plot 3' in the dataset without the year. Excel automatically filled in the missing year as the + current year (i.e. 2015) - introducing an error in the data without the field assistant realising. If you get a response + from the participants that they've spotted and fixed the problem with dates, you can say you'll come back to dates again + towards the end of the lesson in episode [03-dates-as-data](03-dates-as-data.md). If participants have not spotted the + problem with dates in the 'plot 3' table, that's fine as you will address peculiarities of working with dates in + spreadsheets in episode [03-dates-as-data](03-dates-as-data.md). + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: There are excellent references to help you work with data in spreadsheets. These include: -Regarding R scripting; +Regarding R scripting; + > Hadley Wickham, *Tidy Data*, Vol. 59, Issue 10, Sep 2014, Journal of -> Statistical Software. [http://www.jstatsoft.org/v59/i10](http://www.jstatsoft.org/v59/i10). +> Statistical Software. [https://www.jstatsoft.org/v59/i10](https://www.jstatsoft.org/v59/i10). Regarding Data organization in spreadsheets; -> Karl W. Broman & Kara H. Woo (2018) Data Organization in Spreadsheets, The American Statistician, 72:1, 2-10, + +> Karl W. Broman \& Kara H.
Woo (2018) Data Organization in Spreadsheets, The American Statistician, 72:1, 2-10, > [DOI: 10.1080/00031305.2017.1375989](https://www.tandfonline.com/doi/full/10.1080/00031305.2017.1375989). +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Never modify your raw data. Always make a copy before making any changes. +- Keep track of all of the steps you take to clean your data in a plain text file. +- Organize your data according to tidy data principles. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + diff --git a/episodes/02-common-mistakes.md b/episodes/02-common-mistakes.md index 6720945..00216c7 100644 --- a/episodes/02-common-mistakes.md +++ b/episodes/02-common-mistakes.md @@ -1,25 +1,21 @@ --- -title: "Formatting problems" +title: Formatting problems teaching: 20 exercises: 0 -questions: -- "What are some common challenges with formatting data in spreadsheets and how can we avoid them?" -objectives: -- "Recognize and resolve common spreadsheet formatting problems." -keypoints: -- "Avoid using multiple tables within one spreadsheet." -- "Avoid spreading data across multiple tabs." -- "Record zeros as zeros." -- "Use an appropriate null value to record missing data." -- "Don't use formatting to convey information or to make your spreadsheet look pretty." -- "Place comments in a separate column." -- "Record units in column headers." -- "Include only one piece of information in a cell." -- "Avoid spaces, numbers and special characters in column headers." -- "Avoid special characters in your data." -- "Record metadata in a separate plain text file." --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Recognize and resolve common spreadsheet formatting problems. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What are some common challenges with formatting data in spreadsheets and how can we avoid them? 
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + ## Common Spreadsheet Errors This episode is meant to be used as a reference for discussion as learners identify issues with the messy dataset discussed in the @@ -38,47 +34,47 @@ There are a few potential errors to be on the lookout for in your own data as we - [Using problematic field names](#field_name) - [Using special characters in data](#special) - [Inclusion of metadata in data table](#metadata) -- [Date formatting](../03-dates-as-data/) +- [Date formatting](03-dates-as-data.md) - -## Using multiple tables +## Using multiple tables {#tables} A common strategy is creating multiple data tables within one spreadsheet. This confuses the computer, so don't do this! When you create multiple tables within one -spreadsheet, you’re drawing false associations between things for the computer, -which sees each row as an observation. You’re also potentially using the same +spreadsheet, you're drawing false associations between things for the computer, +which sees each row as an observation. You're also potentially using the same field name in multiple places, which will make it harder to clean your data up into a usable form. The example below depicts the problem: -![multiple tabs](../fig/2_datasheet_example.jpg) +![](fig/2_datasheet_example.jpg){alt='multiple tabs'} -In the example above, the computer will see (for example) row 4 and assume that all columns A-AF -refer to the same sample. This row actually represents four distinct samples -(sample 1 for each of four different collection dates - May 29th, June 12th, June 19th, and June 26th), +In the example above, the computer will see (for example) row 4 and assume that all columns A-AF +refer to the same sample. 
This row actually represents four distinct samples +(sample 1 for each of four different collection dates - May 29th, June 12th, June 19th, and June 26th), as well as some calculated summary statistics (an average (avr) and standard error of measurement (SEM)) for two of those samples. Other rows are similarly problematic. -## Using multiple tabs +## Using multiple tabs {#tabs} But what about workbook tabs? That seems like an easy way to organize data, right? Well, yes and no. When you create extra tabs, you fail to allow the computer to see connections in the data that are there (you have to introduce spreadsheet application-specific functions or scripting to ensure this connection). Say, for instance, you make a separate tab for each day you take a measurement. This isn't good practice for two reasons: + 1) you are more likely to accidentally add inconsistencies to your data if each time you take a measurement, you start recording data in a new tab, and 2) even if you manage to prevent all inconsistencies from creeping in, you will add an extra step for yourself before you analyze the -data because you will have to combine these data into a single datatable. You will have to explicitly tell the computer how to combine -tabs - and if the tabs are inconsistently formatted, you might even have to do it manually. + data because you will have to combine these data into a single datatable. You will have to explicitly tell the computer how to combine + tabs - and if the tabs are inconsistently formatted, you might even have to do it manually. -The next time you’re entering data, and you go to create another tab or table, ask yourself if you could avoid adding this tab by adding another column to your original spreadsheet. We used multiple tabs in our example of a messy data file, but now you've seen how you can reorganize your data to consolidate across tabs. 
+The next time you're entering data, and you go to create another tab or table, ask yourself if you could avoid adding this tab by adding another column to your original spreadsheet. We used multiple tabs in our example of a messy data file, but now you've seen how you can reorganize your data to consolidate across tabs. -Your data sheet might get very long over the course of the experiment. This makes it harder to enter data if you can’t see your headers -at the top of the spreadsheet. But don't repeat your header row. These can easily get mixed into the data, +Your data sheet might get very long over the course of the experiment. This makes it harder to enter data if you can't see your headers +at the top of the spreadsheet. But don't repeat your header row. These can easily get mixed into the data, leading to problems down the road. Instead you can freeze the column headers so that they remain visible even when you have a spreadsheet with many rows. [Documentation on how to freeze column headers in MS Excel](https://support.office.com/en-ca/article/Freeze-column-headings-for-easy-scrolling-57ccce0c-cf85-4725-9579-c5d13106ca6a) -## Not filling in zeros +## Not filling in zeros {#zeros} It might be that when you're measuring something, it's usually a zero, say the number of times a rabbit @@ -87,31 +83,29 @@ writing in the number zero in that column, when it's mostly zeros? However, there's a difference between a zero and a blank cell in a spreadsheet. To the computer, a zero is actually data. You measured or counted it. A blank cell means that it wasn't measured and the computer will interpret it as an unknown value (otherwise known as a -null value). +null value). The spreadsheets or statistical programs will likely mis-interpret blank cells that you intend to be zeros. By not entering the value of -your observation, you are telling your computer to represent that data as unknown or missing (null). 
This can cause problems with +your observation, you are telling your computer to represent that data as unknown or missing (null). This can cause problems with subsequent calculations or analyses. For example, the average of a set of numbers which includes a single null value is always null (because the computer can't guess the value of the missing observations). Because of this, it's very important to record zeros as zeros and truly missing data as nulls. +## Using problematic null values {#null} -## Using problematic null values **Example**: using -999, other numerical values, zero, or text to represent missing values. -Whatever the reason, it’s a problem if unknown or missing data is recorded as -999, 999, or 0. +Whatever the reason, it's a problem if unknown or missing data is recorded as -999, 999, or 0. Many statistical programs will not recognize that these are intended to represent missing (null) values. How these values are interpreted will depend on the software you use to analyze your data. +**Solutions**: - -**Solutions**: - -A solution will depend on the final application of your data and how you intend to analyze it, -but it is essential to use a clearly defined and CONSISTENT null indicator. Blank cells are the best choices for most applications; +A solution will depend on the final application of your data and how you intend to analyze it, +but it is essential to use a clearly defined and CONSISTENT null indicator. Blank cells are the best choice for most applications; when working in R, `NA` may be an acceptable null value choice. There are many reasons why null values get represented differently within a dataset. -Sometimes confusing null values are automatically recorded from the measuring device. If that's the case, there's not much you can do, but it can be addressed in data cleaning with a tool like [OpenRefine](http://www.datacarpentry.org/OpenRefine-ecology-lesson/) before analyzing or sharing.
In other cases, null values are used to convey different reasons why the data is missing. This is important information to capture, but is actually using one column to capture two pieces of information. Like for [using formatting to convey information](#formatting) it would be good here to create a new column like 'data_missing' and use that column to capture the different reasons. +Sometimes confusing null values are automatically recorded from the measuring device. If that's the case, there's not much you can do, but it can be addressed in data cleaning with a tool like [OpenRefine](https://www.datacarpentry.org/OpenRefine-ecology-lesson/) before analyzing or sharing. In other cases, null values are used to convey different reasons why the data is missing. This is important information to capture, but is actually using one column to capture two pieces of information. Like for [using formatting to convey information](#formatting) it would be good here to create a new column like 'data\_missing' and use that column to capture the different reasons. Whatever the reason, missing data is a problem. It is essential to use a clearly defined and consistent null indicator. Blanks (most applications) and NA (for R) are good choices. White et al, 2013, explain good choices for indicating null values for different software applications in their article: @@ -127,7 +121,7 @@ Blanks (most applications) and NA (for R) are good choices. White et al, 2013, e 0 Indistinguishable from a true zero - + Never use @@ -139,7 +133,7 @@ Blanks (most applications) and NA (for R) are good choices. White et al, 2013, e -999, 999 Not recognized as null by many programs without user input. Can be inadvertently entered into calculations - + Avoid @@ -186,56 +180,53 @@ Blanks (most applications) and NA (for R) are good choices. 
White et al, 2013, e - -## Using formatting to convey information +## Using formatting to convey information {#formatting} **Example**: highlighting cells, rows or columns that should be excluded from an analysis, leaving blank rows to indicate separations in data. -![formatting](../fig/formatting.png) +![](fig/formatting.png){alt='formatting'} **Solution**: create a new field to encode which data should be excluded. -![good formatting](../fig/good_formatting.png) - +![](fig/good_formatting.png){alt='good formatting'} -## Using formatting to make the data sheet look pretty +## Using formatting to make the data sheet look pretty {#formatting\_pretty} **Example**: merging cells. -**Solution**: If you’re not careful, formatting a worksheet to be more aesthetically pleasing can compromise your computer’s ability to +**Solution**: If you're not careful, formatting a worksheet to be more aesthetically pleasing can compromise your computer's ability to see associations in the data. Merged cells will make your data unreadable by statistics software. Consider restructuring your data in such a way that you will not need to merge cells to organize your data. - -## Placing comments or units in cells +## Placing comments or units in cells {#units} **Example**: Your data was collected, in part, by a summer student who you later found out was mis-identifying some of your species, some of the time. You want a way to note these data are suspect. **Solution**: Most analysis software can't see Excel or LibreOffice comments, and would be confused by comments placed within your data -cells. As described above for formatting, create another field if you need to add notes to cells. Similarly, don’t include units in -cells: ideally, all the measurements you place in one column should be in the same unit, but if for some reason they aren’t, create +cells. As described above for formatting, create another field if you need to add notes to cells. 
Similarly, don't include units in +cells: ideally, all the measurements you place in one column should be in the same unit, but if for some reason they aren't, create another field and specify the units the cell is in. - -## Entering more than one piece of information in a cell +## Entering more than one piece of information in a cell {#info} **Example**: You find one male, and one female of the same species. You enter this as 1M, 1F. -**Solution**: Don't include more than one piece of information in a cell. This will limit the ways in which you can analyze your data. +**Solution**: Don't include more than one piece of information in a cell. This will limit the ways in which you can analyze your data. If you need both these measurements, design your data sheet to include this information. For example, include one column for number of individuals and a separate column for sex. -## Using problematic field names +## Using problematic field names {#field\_name} + Choose descriptive field names, but be careful not to include spaces, numbers, or special characters of any kind. Spaces can be -misinterpreted by parsers that use whitespace as delimiters and some programs don’t like field names that are text strings that start -with numbers. +misinterpreted by parsers that use whitespace as delimiters and some programs don't like field names that are text strings that start +with numbers. Underscores (`_`) are a good alternative to spaces. Consider writing names in camel case (like this: ExampleFileName) to improve readability. Remember that abbreviations that make sense at the moment may not be so obvious in 6 months, but don't overdo it with names that are excessively long. Including the units in the field names avoids confusion and enables others to readily interpret your fields. -**Examples** +**Examples** @@ -280,7 +271,7 @@ that are excessively long. Including the units in the field names avoids confusi
-## Using special characters in data +## Using special characters in data {#special} **Example**: You treat your spreadsheet program as a word processor when writing notes, for example copying data directly from Word or other applications. @@ -293,30 +284,47 @@ relational database, dangerous things may occur, such as lines being cut in half General best practice is to avoid adding characters such as newlines, tabs, and vertical tabs. In other words, treat a text cell as if it were a simple web form that can only contain text and spaces. - -## Inclusion of metadata in data table +## Inclusion of metadata in data table {#metadata} **Example**: You add a legend at the top or bottom of your data table explaining column meaning, units, exceptions, etc. -**Solution**: Recording data about your data (“metadata”) is essential. You may be on intimate terms with your dataset while you are +**Solution**: Recording data about your data ("metadata") is essential. You may be on intimate terms with your dataset while you are collecting and analysing it, but the chances that you will still remember that the variable "sglmemgp" means single member of group, for -example, or the exact algorithm you used to transform a variable or create a derived one, after a few months, a year, or more are slim. +example, or the exact algorithm you used to transform a variable or create a derived one, after a few months, a year, or more are slim. As well, there are many reasons other people may want to examine or use your data - to understand your findings, to verify your findings, -to review your submitted publication, to replicate your results, to design a similar study, or even to archive your data for access and -re-use by others. While digital data by definition are machine-readable, understanding their meaning is a job for human beings. 
The +to review your submitted publication, to replicate your results, to design a similar study, or even to archive your data for access and +re-use by others. While digital data by definition are machine-readable, understanding their meaning is a job for human beings. The importance of documenting your data during the collection and analysis phase of your research cannot be overestimated, especially if your -research is going to be part of the scholarly record. +research is going to be part of the scholarly record. -However, metadata should not be contained in the data file itself. Unlike a table in a paper or a supplemental file, metadata (in the -form of legends) should not be included in a data file since this information is not data, and including it can disrupt how computer -programs interpret your data file. Rather, metadata should be stored as a separate file in the same directory as your data file, +However, metadata should not be contained in the data file itself. Unlike a table in a paper or a supplemental file, metadata (in the +form of legends) should not be included in a data file since this information is not data, and including it can disrupt how computer +programs interpret your data file. Rather, metadata should be stored as a separate file in the same directory as your data file, preferably in plain text format with a name that clearly associates it with your data file. Because metadata files are free text format, they also allow you to encode comments, units, information about how null values are encoded, etc. that are important to document but can -disrupt the formatting of your data file. +disrupt the formatting of your data file. + +Additionally, file or database level metadata describes how files that make up the dataset relate to each other; what format they are +in; and whether they supersede or are superseded by previous files. A folder-level readme.txt file is the classic way of accounting for +all the files and folders in a project.
+ +(Text on metadata adapted from the online course Research Data [MANTRA](https://datalib.edina.ac.uk/mantra) by EDINA and Data Library, University of Edinburgh. MANTRA is licensed under a [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/).) + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Avoid using multiple tables within one spreadsheet. +- Avoid spreading data across multiple tabs. +- Record zeros as zeros. +- Use an appropriate null value to record missing data. +- Don't use formatting to convey information or to make your spreadsheet look pretty. +- Place comments in a separate column. +- Record units in column headers. +- Include only one piece of information in a cell. +- Avoid spaces, numbers and special characters in column headers. +- Avoid special characters in your data. +- Record metadata in a separate plain text file. + +:::::::::::::::::::::::::::::::::::::::::::::::::: -Additionally, file or database level metadata describes how files that make up the dataset relate to each other; what format are they are -in; and whether they supercede or are superceded by previous files. A folder-level readme.txt file is the classic way of accounting for -all the files and folders in a project. -(Text on metadata adapted from the online course Research Data [MANTRA](http://datalib.edina.ac.uk/mantra) by EDINA and Data Library, University of Edinburgh. MANTRA is licensed under a [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/).) diff --git a/episodes/03-dates-as-data.md b/episodes/03-dates-as-data.md index 72b0433..f1f88c3 100644 --- a/episodes/03-dates-as-data.md +++ b/episodes/03-dates-as-data.md @@ -1,18 +1,24 @@ --- -title: "Dates as data" +title: Dates as data teaching: 10 exercises: 3 -questions: -- "What are good approaches for handling dates in spreadsheets?" -objectives: -- "Describe how dates are stored and formatted in spreadsheets." 
-- "Describe the advantages of alternative date formatting in spreadsheets." -- "Demonstrate best practices for entering dates in spreadsheets." -keypoints: -- "Treating dates as multiple pieces of data rather than one makes them easier to handle." --- -Dates in spreadsheets can be a problem. For one thing, dates +::::::::::::::::::::::::::::::::::::::: objectives + +- Describe how dates are stored and formatted in spreadsheets. +- Describe the advantages of alternative date formatting in spreadsheets. +- Demonstrate best practices for entering dates in spreadsheets. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What are good approaches for handling dates in spreadsheets? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +Dates in spreadsheets can be a problem. For one thing, dates are stored in a single column. While this seems the most natural way to record dates, it actually is not best practice. A spreadsheet application will display the dates in a @@ -22,98 +28,121 @@ and stores the dates may be problematic. In particular, please remember that DATE functions that are valid for a given spreadsheet program (be it LibreOffice Calc, Microsoft Excel, OpenOffice, Gnumeric, etc.) are usually guaranteed to be compatible only within the same -family of products. Most of the images of spreadsheets in this lesson come +family of products. Most of the images of spreadsheets in this lesson come from Microsoft Excel, run on a Mac or on Windows. -Regardless of your spreadsheet, if you will later need to export the data and +Regardless of your spreadsheet, if you will later need to export the data and ***need to conserve the timestamps***, you are better off handling them using one of the solutions discussed below. 
-One of the big problems with Excel is it can [turn things that aren't dates into dates](https://nsaunders.wordpress.com/2012/10/22/gene-name-errors-and-excel-lessons-not-learned/), -for example gene/protein names or identifiers like MAR1, DEC1, OCT4 will be changed to dates, and you cannot retreive the -original name or identifier (except manually). So if you avoid the date format overall, it's easier to work with these types of data. +One of the big problems with Excel is it can [turn things that aren't dates into dates](https://nsaunders.wordpress.com/2012/10/22/gene-name-errors-and-excel-lessons-not-learned/), +for example gene/protein names or identifiers like MAR1, DEC1, OCT4 will be changed to dates, and you cannot retrieve the +original name or identifier (except manually). So if you avoid the date format overall, it's easier to work with these types of data. When you must work with dates, here is how to do it efficiently. -> ## Exercise -> -> Challenge: pulling month, day and year out of dates -> -> - Let's create a tab called `dates` in our data spreadsheet and copy the 'plot 3' table from the `2014` tab (that contains the problematic dates). -> - Let’s extract month, day and year from the dates in the `Date collected` column into new columns. For this we -> can use the following built-in Excel functions: -> -> `YEAR()` -> -> `MONTH()` -> -> `DAY()` -> -> (Make sure the new columns are formatted as a number and not as a date.) -> -> You can see that even though we expected the year to be 2014, the year is actually 2015. What happened here is that the field assistant who collected the data for year 2014 initially forgot to include their data for 'plot 3' in this dataset. They came back in 2015 to add the missing data into the dataset and entered the dates for 'plot 3' without the year. Excel automatically interpreted the year as 2015 - the year the data was entered into the spreadsheet and not the year the data was collected. 
Thereby, the spreadsheet program introduced an error in the dataset without the field assistant realising. -> -> > ## Solution -> > ![dates, exersize 1](../fig/solution_exercise_1_dates.png) -> > {: .output} -> {: .solution} -{: .challenge} - -> ## Exercise -> -> Challenge: pulling hour, minute and second out of the current time -> -> Current time and date are best retrieved using the functions `NOW()`, which -> returns the current date and time, and `TODAY()`, which returns the current -> date. The results will be formatted according to your computer's settings. -> -> 1) Extract the year, month and day from the current date and time string -> returned by the `NOW()` function. -> 2) Calculate the current time using `NOW()-TODAY()`. -> 3) Extract the hour, minute and second from the current time using -> functions `HOUR()`, `MINUTE()` and `SECOND()`. -> 4) Press `F9` to force the spreadsheet to recalculate the `NOW()` function, -> and check that it has been updated. -> > ## Solution -> > 1) To get the year, type `=YEAR(NOW())` into any cell in your spreadsheet. To get the month, type `=MONTH(NOW())`. To get the day, type `=DAY(NOW())`. -> > 2) Typing `=NOW()-TODAY()` will result in a decimal value that is not easily human parsable to a clock-based time. You will need to use the strategies in the third part of this challenge to convert this decimal value to readable time. -> > 3) To extract the hour, type `=HOUR(NOW()-TODAY())` and similarly for minute and second. -> {: .solution} -{: .challenge} +::::::::::::::::::::::::::::::::::::::: challenge + +## Exercise + +Challenge: pulling month, day and year out of dates + +- Let's create a tab called `dates` in our data spreadsheet and copy the 'plot 3' table from the `2014` tab (that contains the problematic dates). +- Let's extract month, day and year from the dates in the `Date collected` column into new columns. 
For this we + can use the following built-in Excel functions: + +`YEAR()` + +`MONTH()` + +`DAY()` + +(Make sure the new columns are formatted as a number and not as a date.) + +You can see that even though we expected the year to be 2014, the year is actually 2015. What happened here is that the field assistant who collected the data for year 2014 initially forgot to include their data for 'plot 3' in this dataset. They came back in 2015 to add the missing data into the dataset and entered the dates for 'plot 3' without the year. Excel automatically interpreted the year as 2015 - the year the data was entered into the spreadsheet and not the year the data was collected. Thereby, the spreadsheet program introduced an error in the dataset without the field assistant realising. + +::::::::::::::: solution + +## Solution + +![](fig/solution_exercise_1_dates.png) +{alt='dates, exercise 1' .output} + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::: challenge + +## Exercise + +Challenge: pulling hour, minute and second out of the current time + +Current time and date are best retrieved using the functions `NOW()`, which +returns the current date and time, and `TODAY()`, which returns the current +date. The results will be formatted according to your computer's settings. + +1) Extract the year, month and day from the current date and time string + returned by the `NOW()` function. +2) Calculate the current time using `NOW()-TODAY()`. +3) Extract the hour, minute and second from the current time using + functions `HOUR()`, `MINUTE()` and `SECOND()`. +4) Press `F9` to force the spreadsheet to recalculate the `NOW()` function, + and check that it has been updated. + +::::::::::::::: solution + +## Solution + +1) To get the year, type `=YEAR(NOW())` into any cell in your spreadsheet. To get the month, type `=MONTH(NOW())`. To get the day, type `=DAY(NOW())`. 
+2) Typing `=NOW()-TODAY()` will result in a decimal value that is not easily human parsable to a clock-based time. You will need to use the strategies in the third part of this challenge to convert this decimal value to readable time. +3) To extract the hour, type `=HOUR(NOW()-TODAY())` and similarly for minute and second. + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: ## Preferred date format It is much safer to store dates with [YEAR, MONTH, DAY](#day) in separate columns or as [YEAR and DAY-OF-YEAR](#doy) in separate columns. -**Note**: Excel is unable to parse dates from before 1899-12-31, and will thus leave these untouched. If you’re mixing historic data +**Note**: Excel is unable to parse dates from before 1899-12-31, and will thus leave these untouched. If you're mixing historic data from before and after this date, Excel will translate only the post-1900 dates into its internal format, thus resulting in mixed data. -If you’re working with historic data, be extremely careful with your dates! +If you're working with historic data, be extremely careful with your dates! Excel also entertains a second date system, the 1904 date system, as the default in Excel for Macintosh. This system will assign a different serial number than the [1900 date system](https://support.microsoft.com/en-us/help/214330/differences-between-the-1900-and-the-1904-date-system-in-excel). Because of this, -[dates must be checked for accuracy when exporting data from Excel](http://uc3.cdlib.org/2014/04/09/abandon-all-hope-ye-who-enter-dates-in-excel/) (look for dates that are ~4 years off). +[dates must be checked for accuracy when exporting data from Excel](https://uc3.cdlib.org/2014/04/09/abandon-all-hope-ye-who-enter-dates-in-excel/) (look for dates that are ~4 years off). ## Date formats in spreadsheets -Spreadsheet programs have numerous “useful features” which allow them to handle dates in a variety of ways. 
+Spreadsheet programs have numerous "useful features" which allow them to handle dates in a variety of ways. -![Many formats, many ambiguities](../fig/5_excel_dates_1.jpg) +![](fig/5_excel_dates_1.jpg){alt="Many formats, many ambiguities"} -But these "features" often allow ambiguity to creep into your data. Ideally, data should be as unambiguous as possible. +But these "features" often allow ambiguity to creep into your data. Ideally, data should be as unambiguous as possible. ### Dates stored as integers The first thing you need to know is that Excel stores dates as numbers - see the last column in the above figure. Essentially, it counts the days from a default of December 31, 1899, and thus stores July 2, 2014 as the serial number 41822. -(But wait. That’s the default on my version of Excel. We’ll get into how this can introduce problems down the line later in this lesson. ) +(But wait. That's the default on my version of Excel. We'll get into how this can introduce problems down the line later in this lesson. ) This serial number thing can actually be useful in some circumstances. By using the above functions we can easily add days, months or years to a given date. Say you had a sampling plan where you needed to sample every thirty seven days. In another cell, you could type: - - =B2+37 - + +``` +=B2+37 +``` + And it would return - 8-Aug +``` +8-Aug +``` because it understands the date as a number `41822`, and `41822 + 37 = 41859` which Excel interprets as August 8, 2014. It retains the format (for the most @@ -133,58 +162,69 @@ As for dates, times are handled in a similar way; seconds can be directly added but to add hour and minutes we need to make sure that we are adding the quantities to the correct entities. -Which brings us to the many different ways Excel provides in how it displays dates. 
If you refer to the figure above, you’ll see that -there are many ways that ambiguity creeps into your data depending on the format you chose when you enter your data, and if you’re not -fully aware of which format you’re using, you can end up actually entering your data in a way that Excel will badly misinterpret and -you will end up with errors in your data that will be extremely difficult to track down and troubleshoot. - -> ## Exercise -> What happens to the dates in the `dates` tab of our workbook if we save this sheet in Excel (in `csv` format) and then open the file in a plain text editor (like TextEdit or Notepad)? What happens to the dates if we then open the `csv` file in Excel? -> > ## Solution -> > - Click to the `dates` tab of the workbook and double-click on any of the values in the `Date collected` column. Notice that the dates display with the year 2015. -> > - Select `File -> Save As` in Excel and in the drop down menu for file format select `CSV UTF-8 (Comma delimited) (.csv)`. Click `Save`. -> > - You will see a pop-up that says "This workbook cannot be saved in the selected file format because it contains multiple sheets." Choose `Save Active Sheet`. -> > - Navigate to the file in your finder application. Right click and select `Open With`. Choose a plain text editor application and view the file. Notice that the dates display as month/day without any year information. -> > - Now right click on the file again and open with Excel. Notice that the dates display with the current year, not 2015. -> > As you can see, exporting data from Excel and then importing it back into Excel fundamentally changed the data once again! -> {: .solution} -{: .challenge} +Which brings us to the many different ways Excel provides in how it displays dates. 
If you refer to the figure above, you'll see that +there are many ways that ambiguity creeps into your data depending on the format you chose when you enter your data, and if you're not +fully aware of which format you're using, you can end up actually entering your data in a way that Excel will badly misinterpret and +you will end up with errors in your data that will be extremely difficult to track down and troubleshoot. + +::::::::::::::::::::::::::::::::::::::: challenge + +## Exercise + +What happens to the dates in the `dates` tab of our workbook if we save this sheet in Excel (in `csv` format) and then open the file in a plain text editor (like TextEdit or Notepad)? What happens to the dates if we then open the `csv` file in Excel? + +::::::::::::::: solution + +## Solution + +- Click to the `dates` tab of the workbook and double-click on any of the values in the `Date collected` column. Notice that the dates display with the year 2015. +- Select `File -> Save As` in Excel and in the drop down menu for file format select `CSV UTF-8 (Comma delimited) (.csv)`. Click `Save`. +- You will see a pop-up that says "This workbook cannot be saved in the selected file format because it contains multiple sheets." Choose `Save Active Sheet`. +- Navigate to the file in your finder application. Right click and select `Open With`. Choose a plain text editor application and view the file. Notice that the dates display as month/day without any year information. +- Now right click on the file again and open with Excel. Notice that the dates display with the current year, not 2015. + As you can see, exporting data from Excel and then importing it back into Excel fundamentally changed the data once again! 
+ + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: **Note** You will notice that when exporting into a text-based format (such as CSV), Excel will export its internal date integer instead of a useful value (that is, the dates will be represented as integer numbers). This can potentially lead to problems if you use other software to manipulate the file. ### Advantages of Alternative Date Formatting -### Storing dates as YEAR, MONTH, DAY +### Storing dates as YEAR, MONTH, DAY {#day} Storing dates in YEAR, MONTH, DAY format helps remove this ambiguity. Let's look at this issue a bit closer. For instance this is a spreadsheet representing insect counts that were taken every few days over the summer, and things went something like this: -![So, so ambiguous, it's even confusing Excel](../fig/6_excel_dates_2.jpg) +![](fig/6_excel_dates_2.jpg){alt="So, so ambiguous, it's even confusing Excel"} If Excel was to be believed, this person had been collecting bugs **in the future**. Now, we have no doubt this person is highly capable, but I believe time travel was beyond even their grasp. Entering dates in one cell is helpful but due to the fact that the spreadsheet programs may interpret and save the data in different ways -(doing that somewhat behind the scenes), there is a better practice. +(doing that somewhat behind the scenes), there is a better practice. In dealing with dates in spreadsheets, separate date data into separate fields (day, month, year), which will eliminate any chance of -ambiguity. +ambiguity. -### Storing dates as YEAR, DAY-OF-YEAR +### Storing dates as YEAR, DAY-OF-YEAR {#doy} There is also another option. You can also store dates as year and day of year (DOY). Why? Because depending on your question, this might be what's useful to you, and there is practically no possibility for ambiguity creeping in. 
-Statistical models often incorporate year as a factor, or a categorical variable, rather than a numeric variable, to account for -year-to-year variation, and DOY can be used to measure the passage of time within a year. +Statistical models often incorporate year as a factor, or a categorical variable, rather than a numeric variable, to account for +year-to-year variation, and DOY can be used to measure the passage of time within a year. -So, can you convert all your dates into DOY format? Well, in Excel, here’s a useful guide: +So, can you convert all your dates into DOY format? Well, in Excel, here's a useful guide: -![Kill that ambiguity before it bites you!](../fig/7_excel_dates_3.jpg) +![](fig/7_excel_dates_3.jpg){alt="Kill that ambiguity before it bites you!"} -### Storing dates as a single string +### Storing dates as a single string {#str} Another alternative could be to convert the date string into a single string using the `YYYYMMDD` format. @@ -193,14 +233,21 @@ This option also works for datetimes using the `YYYYMMDDhhmmss` format. So the datetime `March 24, 2015 17:25:35` would become `20150324172535`, where: - YYYY: the full year, i.e. 2015 MM: the month, i.e. 03 DD: the day of month, i.e. 24 hh: hour of day, i.e. 17 mm: minutes, i.e. 25 -ss: seconds, i.e. 35 +ss: seconds, i.e. 35 Such strings will be correctly sorted in ascending or descending order, and by knowing the format they can then be correctly processed by the receiving software. + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Treating dates as multiple pieces of data rather than one makes them easier to handle. 
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/04-quality-control.md b/episodes/04-quality-control.md index 0a482d1..732d6cd 100644 --- a/episodes/04-quality-control.md +++ b/episodes/04-quality-control.md @@ -1,18 +1,21 @@ --- -title: "Quality control" +title: Quality control teaching: 20 exercises: 0 -questions: -- "How can we carry out basic quality control and quality assurance in spreadsheets?" -objectives: -- "Apply quality control techniques to identify errors in spreadsheets and limit incorrect data entry." -keypoints: -- "Always copy your original spreadsheet file and work with a copy so you don't affect the raw data." -- "Use data validation to prevent accidentally entering invalid data." -- "Use sorting to check for invalid data." -- "Use conditional formatting (cautiously) to check for invalid data." --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Apply quality control techniques to identify errors in spreadsheets and limit incorrect data entry. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How can we carry out basic quality control and quality assurance in spreadsheets? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + When you have a well-structured data table, you can use several simple techniques within your spreadsheet to ensure the data you enter is free of errors. These approaches include techniques that are @@ -20,7 +23,7 @@ implemented prior to entering data (quality assurance) and techniques that are used after entering data to check for errors (quality control). -# Quality Assurance +## Quality Assurance Quality assurance stops bad data from ever being entered by checking to see if values are valid during data entry. For example, if research is being conducted @@ -37,17 +40,17 @@ in each data column. 2\. 
On the `Data` tab select `Data Validation` -![Image of Data Validation button on Data tab](../fig/data_validation.png) +![](fig/data_validation.png){alt='Image of Data Validation button on Data tab'} 3\. In the `Allow` box select the kind of data that should be in the - column. Options include whole numbers, decimals, lists of items, dates, and - other values. +column. Options include whole numbers, decimals, lists of items, dates, and +other values. + +![](fig/data_validation_window.png){alt='Image of Data Validation window'} -![Image of Data Validation window](../fig/data_validation_window.png) - 4\. After selecting an item enter any additional details. For example, if you've - chosen a list of values, enter a comma-delimited or semi-colon list of allowable - values in the `Source` box. +chosen a list of values, enter a comma-delimited or semi-colon list of allowable +values in the `Source` box. Let's try this out by setting the plot column in our spreadsheet to only allow plot values that are integers between 1 and 24. @@ -57,23 +60,23 @@ plot values that are integers between 1 and 24. 3. In the `Allow` box select `Whole number` 4. Set the minimum and maximum values to 1 and 24. -![Image of Data Validation window for validating plot values](../fig/plot_validation.png) +![](fig/plot_validation.png){alt='Image of Data Validation window for validating plot values'} Now let's try entering a new value in the plot column that isn't a valid plot. The spreadsheet stops us from entering the wrong value and asks us if we would like to try again. 
-![Image of error when trying to enter invalid data](../fig/invalid_value.png) +![](fig/invalid_value.png){alt='Image of error when trying to enter invalid data'} You can also customize the resulting message to be more informative by entering your own message in the `Input Message` tab -![Image of Input Message tab](../fig/input_message.png) +![](fig/input_message.png){alt='Image of Input Message tab'} or allow invalid data to result in a warning rather than an error by modifying the `Style` option on the `Error Alert` tab. -![Image of Error Alert tab](../fig/error_alert.png) +![](fig/error_alert.png){alt='Image of Error Alert tab'} Quality assurance can make data entry easier as well as more robust. For example, if you use a list of options to restrict data entry, the spreadsheet @@ -81,12 +84,12 @@ will provide you with a drop-downlist of the available items. So, instead of trying to remember how to spell *Dipodomys spectabilis*, you can select the right option from the list. -![Image of drop-down menu](../fig/drop_down_list2.png) +![](fig/drop_down_list2.png){alt='Image of drop-down menu'} -# Quality Control +## Quality Control Tip: *Before doing any quality control operations, save your original file with the formulas and a name indicating it is the original -data. Create a separate file with appropriate naming and versioning, and ensure your data is stored as values and not as formulas. +data. Create a separate file with appropriate naming and versioning, and ensure your data is stored as values and not as formulas. Because formulas refer to other cells, and you may be moving cells around, you may compromise the integrity of your data if you do not take this step!* @@ -94,73 +97,94 @@ readMe (README) files: As you start manipulating your data files, create a readM document your manipulations so that they may be easily understood and replicated, either by your future self or by an independent researcher. 
Your readMe file should document all of the files in your data set (including documentation), describe their content and format, and lay out the organizing principles of folders and subfolders. For each of the separate files listed, it is a good idea to -document the manipulations or analyses that were carried out on those data. +document the manipulations or analyses that were carried out on those data. [Cornell University's Research Data Management Service Group](https://data.research.cornell.edu/content/readme) provides detailed guidelines for how to write a good readMe file, along with an adaptable template. -## Sorting +### Sorting + Bad values often sort to the bottom or top of the column. For example, if your data should be numeric, then alphabetical and null data will group at the ends of the sorted data. Sort your data by each field, one at a time. Scan through each column, but pay the most -attention to the top and the bottom of a column. +attention to the top and the bottom of a column. If your dataset is well-structured and does not contain formulas, sorting should never affect the integrity of your dataset. **Remember** to expand your sort in order to prevent data corruption. Expanding your sort ensures that the all the data in one row move together instead of only sorting a single column in isolation. Sorting by only a single column will scramble your data - a single row will no longer represent an individual observation. -> ## Exercise -> -> We've combined all of the tables from the messy data into a single table in a single tab. Download this semi-cleaned data file to your computer: [survey_sorting_exercise](https://github.com/datacarpentry/spreadsheet-ecology-lesson/blob/gh-pages/data/survey_sorting_exercise.xlsx?raw=true) -> -> Once downloaded, sort the `Weight_grams` column in your spreadsheet program from `Largest to Smallest`. -> -> What do you notice? -> -> > ## Solution -> > -> > Click the Sort button on the data tab in Excel. 
A pop-up will appear. Make sure you select `Expand the selection`. -> > -> > ![quality_control0, exercise1](../fig/sorting_button.png) -> > {: .output} -> > -> > The following window will display, choose the column you want to sort as well as the sort order. -> > -> > ![quality_control1, exercise1](../fig/sorting_example.png) -> > {: .output} -> > -> > -> > **Note** how the odd values sort to the top and bottom of the tabular data. -> > The cells containing no data values sort to the bottom of the tabular data, while the cells where the letter "g" was included can be found towards the top. This is a powerful way to check your data for outliers and odd values. -> > -> > ![quality_control2, exercise1](../fig/sorting_solution_1.png) -> > {: .output} -> > -> > ![quality_control3, exercise1](../fig/sorting_solution_2.png) -> > {: .output} -> > -> {: .solution} -{: .challenge} - - - -## Conditional formatting ## +::::::::::::::::::::::::::::::::::::::: challenge + +### Exercise + +We've combined all of the tables from the messy data into a single table in a single tab. Download this semi-cleaned data file to your computer: [survey\_sorting\_exercise](https://github.com/datacarpentry/spreadsheet-ecology-lesson/blob/gh-pages/data/survey_sorting_exercise.xlsx?raw=true) + +Once downloaded, sort the `Weight_grams` column in your spreadsheet program from `Largest to Smallest`. + +What do you notice? + +::::::::::::::: solution + +### Solution + +Click the Sort button on the data tab in Excel. A pop-up will appear. Make sure you select `Expand the selection`. + +![](fig/sorting_button.png) +{alt='quality\_control0, exercise1' .output} + +The following window will display, choose the column you want to sort as well as the sort order. + +![](fig/sorting_example.png) +{alt='quality\_control1, exercise1' .output} + +**Note** how the odd values sort to the top and bottom of the tabular data. 
+The cells containing no data values sort to the bottom of the tabular data, while the cells where the letter "g" was included can be found towards the top. This is a powerful way to check your data for outliers and odd values. + +![](fig/sorting_solution_1.png) +{alt='quality\_control2, exercise1' .output} + +![](fig/sorting_solution_2.png) +{alt='quality\_control3, exercise1' .output} + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +### Conditional formatting + Conditional formatting basically can do something like color code your values by some criteria or lowest to highest. This makes it easy to scan your data for outliers. Conditional formatting should be used with caution, but it can be a great way to flag inconsistent values when entering data. -> ## Exercise -> 1. Make sure the Weight_grams column is highlighted. -> 2. In the main Excel menu bar, click `Home` > `Conditional Formatting...` choose a formatting rule. -> 3. Apply any `2-Color Scale` formatting rule. -> 4. Now we can scan through and different colors will stand out. Do you notice any strange values? -> -> > ## Solution -> > -> > Cells that contain non-numerical values are not colored. This includes both the cells where the letter "g" was included and the empty cells. -> > ![quality_control4, exercise2](../fig/conditional_formating.png) -> > {: .output} -> > -> {: .solution} -{: .challenge} +::::::::::::::::::::::::::::::::::::::: challenge + +### Exercise + +1. Make sure the Weight\_grams column is highlighted. +2. In the main Excel menu bar, click `Home` > `Conditional Formatting...` choose a formatting rule. +3. Apply any `2-Color Scale` formatting rule. +4. Now we can scan through and different colors will stand out. Do you notice any strange values? + +::::::::::::::: solution + +### Solution + +Cells that contain non-numerical values are not colored. This includes both the cells where the letter "g" was included and the empty cells. 
+![](fig/conditional_formating.png) +{alt='quality\_control4, exercise2' .output} + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: It is nice to be able to do these scans in spreadsheets, but we also can do these -checks in a programming language like R, or in OpenRefine or SQL. +checks in a programming language like R, or in OpenRefine or SQL. + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Always copy your original spreadsheet file and work with a copy so you don't affect the raw data. +- Use data validation to prevent accidentally entering invalid data. +- Use sorting to check for invalid data. +- Use conditional formatting (cautiously) to check for invalid data. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/episodes/05-exporting-data.md b/episodes/05-exporting-data.md index 071b12f..a29703a 100644 --- a/episodes/05-exporting-data.md +++ b/episodes/05-exporting-data.md @@ -1,23 +1,28 @@ --- -title: "Exporting data" +title: Exporting data teaching: 10 exercises: 0 -questions: -- "How can we export data from spreadsheets in a way that is useful for downstream applications?" -objectives: -- "Store spreadsheet data in universal file formats." -- "Export data from a spreadsheet to a CSV file." -keypoints: -- "Data stored in common spreadsheet formats will often not be read correctly into data analysis software, introducing errors into your data." -- "Exporting data from spreadsheets to formats like CSV or TSV puts it in a format that can be used consistently by most programs." --- +::::::::::::::::::::::::::::::::::::::: objectives + +- Store spreadsheet data in universal file formats. +- Export data from a spreadsheet to a CSV file. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How can we export data from spreadsheets in a way that is useful for downstream applications? 
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + Storing the data you're going to work with for your analyses in Excel default file format (`*.xls` or `*.xlsx` - depending on the Excel version) isn't a good idea. Why? - Because it is a proprietary format, and it is possible that in - the future, technology won’t exist (or will become sufficiently + the future, technology won't exist (or will become sufficiently rare) to make it inconvenient, if not impossible, to open the file. - Other spreadsheet software may not be able to open files @@ -30,20 +35,20 @@ version) isn't a good idea. Why? to deposit your data in a data repository, and most of them don't accept Excel format. It needs to be in one of the formats discussed below. - + - The above points also apply to other formats such as open data formats used by LibreOffice / Open Office. These formats are not static and do not get parsed the same way by different software packages. -As an example of inconsistencies in data storage, do you remember how we talked about how Excel stores dates earlier? It turns out that -there are multiple defaults for different versions of the software, and you can switch between them all. So, say you’re +As an example of inconsistencies in data storage, do you remember how we talked about how Excel stores dates earlier? It turns out that +there are multiple defaults for different versions of the software, and you can switch between them all. So, say you're compiling Excel-stored data from multiple sources. There are dates in each file - Excel interprets them as their own internally consistent -serial numbers. When you combine the data, Excel will take the serial number from the place you’re importing it from, and interpret it -using the rule set for the version of Excel you’re using. Essentially, you could be adding errors to your data, and it wouldn’t +serial numbers. 
When you combine the data, Excel will take the serial number from the place you're importing it from, and interpret it +using the rule set for the version of Excel you're using. Essentially, you could be adding errors to your data, and it wouldn't necessarily be flagged by any data cleaning methods if your ranges overlap. Storing data in a universal, open, and static format will help deal with this problem. Try tab-delimited (tab separated values or TSV) or comma-delimited (comma separated values or CSV). CSV files are plain text files where the columns are separated by commas, hence 'comma separated values' or CSV. The advantage of a CSV file over an Excel/SPSS/etc. file is that we can open and read a CSV file -using just about any software, including plain text editors like TextEdit or NotePad. +using just about any software, including plain text editors like TextEdit or NotePad. Data in a CSV file can also be easily imported into other formats and environments, such as SQLite and R. We're not tied to a certain version of a certain expensive program when we work with CSV files, so it's a @@ -58,7 +63,7 @@ To save a file you have opened in Excel in CSV format: An important note for backwards compatibility: you can open CSV files in Excel! -![Saving an Excel file to CSV](../fig/excel-to-csv.png) +![](fig/excel-to-csv.png){alt='Saving an Excel file to CSV'} ## A Note on Cross-platform Operability @@ -66,7 +71,7 @@ By default, most coding and statistical environments expect UNIX-style line endi As such, when exporting to CSV using Excel, your data in text format will look like this: ->data1,data21,24,5 +> data1,data21,24,5 When opening your CSV file in Excel again, it will parse it as follows: @@ -76,34 +81,38 @@ However, if you open your CSV file on a different system that does not parse the Your data in text format then look like this: ->data1
->data2
->1
->2
->… +> data1
+> data2
+> 1
+> 2
+> … You will then see a weird character or possibly the string `CR` or `\r`: screen shot 2017-03-31 at 7 26 42 pm -thus causing terrible things to happen to your data. For example, `2\r` is not a valid integer, and thus will throw an error (if you’re lucky) when you attempt to operate on it in R or Python. Note that this happens on Excel for OSX as well as Windows, due to legacy Windows compatibility. +thus causing terrible things to happen to your data. For example, `2\r` is not a valid integer, and thus will throw an error (if you're lucky) when you attempt to operate on it in R or Python. Note that this happens on Excel for OSX as well as Windows, due to legacy Windows compatibility. There are a handful of solutions for enforcing uniform UNIX-style line endings on your exported CSV files: -1. When exporting from Excel, save as a “Windows comma separated (.csv)” file +1. When exporting from Excel, save as a "Windows comma separated (.csv)" file + 2. If you store your data file under version control using Git, edit the `.git/config` file in your repository to automatically translate `\r\n` line endings into `\n`. -Add the following to the file ([see the detailed tutorial](http://nicercode.github.io/blog/2013-04-30-excel-and-line-endings)): - - [filter "cr"] - clean = LC_CTYPE=C awk '{printf(\"%s\\n\", $0)}' | LC_CTYPE=C tr '\\r' '\\n' - smudge = tr '\\n' '\\r'` - - and then create a file `.gitattributes` that contains the line: - - *.csv filter=cr - - -3. Use [dos2unix](http://dos2unix.sourceforge.net/) (available on OSX, *nix, and Cygwin) on local files to standardize line endings. + Add the following to the file ([see the detailed tutorial](https://nicercode.github.io/blog/2013-04-30-excel-and-line-endings)): + + ``` + [filter "cr"] + clean = LC_CTYPE=C awk '{printf(\"%s\\n\", $0)}' | LC_CTYPE=C tr '\\r' '\\n' + smudge = tr '\\n' '\\r' + ``` + + and then create a file `.gitattributes` that contains the line: + + ``` + *.csv filter=cr + ``` + +3. 
Use [dos2unix](https://dos2unix.sourceforge.net/) (available on OSX, \*nix, and Cygwin) on local files to standardize line endings. #### A note on R and `.xlsx` @@ -121,11 +130,20 @@ worksheets in the `.xlsx` documents. #### Caveats on commas -In some datasets, the data values themselves may include commas (,). This is particularly true in countries that +In some datasets, the data values themselves may include commas (,). This is particularly true in countries that use commas as decimal separators. In that case, the software which you use (including Excel) will most likely incorrectly display the data in columns. This is because the commas which are a part of the data values will be -interpreted as delimiters. +interpreted as delimiters. If you are working with data that contains commas, you likely will need to use another delimiter when working in a spreadsheet. In this case, consider using tabs as your delimiter and working with TSV files. TSV files can be exported from spreadsheet -programs in the same way as CSV files. For more of a discussion on data formats and potential issues with commas within datasets see [the discussion page](http://www.datacarpentry.org/spreadsheet-ecology-lesson/discuss/). +programs in the same way as CSV files. For more of a discussion on data formats and potential issues with commas within datasets see [the discussion page](https://www.datacarpentry.org/spreadsheet-ecology-lesson/discuss/). + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Data stored in common spreadsheet formats will often not be read correctly into data analysis software, introducing errors into your data. +- Exporting data from spreadsheets to formats like CSV or TSV puts it in a format that can be used consistently by most programs. 
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/data/survey_data_spreadsheet_messy.xls b/episodes/data/survey_data_spreadsheet_messy.xls similarity index 100% rename from data/survey_data_spreadsheet_messy.xls rename to episodes/data/survey_data_spreadsheet_messy.xls diff --git a/data/survey_sorting_exercise.xlsx b/episodes/data/survey_sorting_exercise.xlsx similarity index 100% rename from data/survey_sorting_exercise.xlsx rename to episodes/data/survey_sorting_exercise.xlsx diff --git a/fig/1_helpful_clippy.jpg b/episodes/fig/1_helpful_clippy.jpg similarity index 100% rename from fig/1_helpful_clippy.jpg rename to episodes/fig/1_helpful_clippy.jpg diff --git a/fig/2_datasheet_example.jpg b/episodes/fig/2_datasheet_example.jpg similarity index 100% rename from fig/2_datasheet_example.jpg rename to episodes/fig/2_datasheet_example.jpg diff --git a/fig/3_white_table_1.jpg b/episodes/fig/3_white_table_1.jpg similarity index 100% rename from fig/3_white_table_1.jpg rename to episodes/fig/3_white_table_1.jpg diff --git a/fig/4_merged_cells.jpg b/episodes/fig/4_merged_cells.jpg similarity index 100% rename from fig/4_merged_cells.jpg rename to episodes/fig/4_merged_cells.jpg diff --git a/fig/5_excel_dates_1.jpg b/episodes/fig/5_excel_dates_1.jpg similarity index 100% rename from fig/5_excel_dates_1.jpg rename to episodes/fig/5_excel_dates_1.jpg diff --git a/fig/6_excel_dates_2.jpg b/episodes/fig/6_excel_dates_2.jpg similarity index 100% rename from fig/6_excel_dates_2.jpg rename to episodes/fig/6_excel_dates_2.jpg diff --git a/fig/7_excel_dates_3.jpg b/episodes/fig/7_excel_dates_3.jpg similarity index 100% rename from fig/7_excel_dates_3.jpg rename to episodes/fig/7_excel_dates_3.jpg diff --git a/img/DC1_logo_small.png b/episodes/fig/DC1_logo_small.png similarity index 100% rename from img/DC1_logo_small.png rename to episodes/fig/DC1_logo_small.png diff --git a/img/DataONE_LOGO.jpg b/episodes/fig/DataONE_LOGO.jpg similarity index 100% rename from 
img/DataONE_LOGO.jpg rename to episodes/fig/DataONE_LOGO.jpg diff --git a/fig/conditional_formating.png b/episodes/fig/conditional_formating.png similarity index 100% rename from fig/conditional_formating.png rename to episodes/fig/conditional_formating.png diff --git a/img/creative-commons-attribution-license.png b/episodes/fig/creative-commons-attribution-license.png similarity index 100% rename from img/creative-commons-attribution-license.png rename to episodes/fig/creative-commons-attribution-license.png diff --git a/fig/csv-mistake.png b/episodes/fig/csv-mistake.png similarity index 100% rename from fig/csv-mistake.png rename to episodes/fig/csv-mistake.png diff --git a/fig/data_validation.png b/episodes/fig/data_validation.png similarity index 100% rename from fig/data_validation.png rename to episodes/fig/data_validation.png diff --git a/fig/data_validation_window.png b/episodes/fig/data_validation_window.png similarity index 100% rename from fig/data_validation_window.png rename to episodes/fig/data_validation_window.png diff --git a/fig/drop_down_list.png b/episodes/fig/drop_down_list.png similarity index 100% rename from fig/drop_down_list.png rename to episodes/fig/drop_down_list.png diff --git a/fig/drop_down_list2.png b/episodes/fig/drop_down_list2.png similarity index 100% rename from fig/drop_down_list2.png rename to episodes/fig/drop_down_list2.png diff --git a/fig/error_alert.png b/episodes/fig/error_alert.png similarity index 100% rename from fig/error_alert.png rename to episodes/fig/error_alert.png diff --git a/fig/excel-to-csv.png b/episodes/fig/excel-to-csv.png similarity index 100% rename from fig/excel-to-csv.png rename to episodes/fig/excel-to-csv.png diff --git a/fig/excel_tables_example.png b/episodes/fig/excel_tables_example.png similarity index 100% rename from fig/excel_tables_example.png rename to episodes/fig/excel_tables_example.png diff --git a/fig/excel_tables_example1.png b/episodes/fig/excel_tables_example1.png similarity index 
100% rename from fig/excel_tables_example1.png rename to episodes/fig/excel_tables_example1.png diff --git a/fig/formatting.png b/episodes/fig/formatting.png similarity index 100% rename from fig/formatting.png rename to episodes/fig/formatting.png diff --git a/fig/good_formatting.png b/episodes/fig/good_formatting.png similarity index 100% rename from fig/good_formatting.png rename to episodes/fig/good_formatting.png diff --git a/fig/input_message.png b/episodes/fig/input_message.png similarity index 100% rename from fig/input_message.png rename to episodes/fig/input_message.png diff --git a/fig/invalid_value.png b/episodes/fig/invalid_value.png similarity index 100% rename from fig/invalid_value.png rename to episodes/fig/invalid_value.png diff --git a/fig/multiple-info.png b/episodes/fig/multiple-info.png similarity index 100% rename from fig/multiple-info.png rename to episodes/fig/multiple-info.png diff --git a/fig/plot_validation.png b/episodes/fig/plot_validation.png similarity index 100% rename from fig/plot_validation.png rename to episodes/fig/plot_validation.png diff --git a/fig/single-info.png b/episodes/fig/single-info.png similarity index 100% rename from fig/single-info.png rename to episodes/fig/single-info.png diff --git a/fig/solution_exercise_1_dates.png b/episodes/fig/solution_exercise_1_dates.png similarity index 100% rename from fig/solution_exercise_1_dates.png rename to episodes/fig/solution_exercise_1_dates.png diff --git a/fig/sorting.png b/episodes/fig/sorting.png similarity index 100% rename from fig/sorting.png rename to episodes/fig/sorting.png diff --git a/fig/sorting_button.png b/episodes/fig/sorting_button.png similarity index 100% rename from fig/sorting_button.png rename to episodes/fig/sorting_button.png diff --git a/fig/sorting_solution_1.png b/episodes/fig/sorting_solution_1.png similarity index 100% rename from fig/sorting_solution_1.png rename to episodes/fig/sorting_solution_1.png diff --git a/fig/sorting_solution_2.png 
b/episodes/fig/sorting_solution_2.png similarity index 100% rename from fig/sorting_solution_2.png rename to episodes/fig/sorting_solution_2.png diff --git a/fig/spreadsheet-setup-updated.png b/episodes/fig/spreadsheet-setup-updated.png similarity index 100% rename from fig/spreadsheet-setup-updated.png rename to episodes/fig/spreadsheet-setup-updated.png diff --git a/fig/spreadsheet-setup.png b/episodes/fig/spreadsheet-setup.png similarity index 100% rename from fig/spreadsheet-setup.png rename to episodes/fig/spreadsheet-setup.png diff --git a/index.md b/index.md index 791fe7d..68a8953 100644 --- a/index.md +++ b/index.md @@ -1,18 +1,23 @@ --- -layout: lesson +site: sandpaper::sandpaper_site --- -{% include base_path.html %} +> **ATTENTION** This is an experimental test of [The Carpentries Workbench](https://carpentries.github.io/workbench) lesson infrastructure. +> It was automatically converted from the source lesson via [the lesson transition script](https://github.com/carpentries/lesson-transition/). +> +> If anything seems off, please contact Zhian Kamvar [zkamvar@carpentries.org](mailto:zkamvar@carpentries.org) -Good data organization is the foundation of any research project. Most + + +Good data organization is the foundation of any research project. Most researchers have data in spreadsheets, so it's the place that many research -projects start. +projects start. -We organize data in spreadsheets in the ways that we as humans want to work with the data, +We organize data in spreadsheets in the ways that we as humans want to work with the data, but computers require that data be organized in particular ways. In order -to use tools that make computation more efficient, such as programming -languages like R or Python, we need to structure our data the way that -computers need the data. 
Since this is where most research projects start, +to use tools that make computation more efficient, such as programming +languages like R or Python, we need to structure our data the way that +computers need the data. Since this is where most research projects start, this is where we want to start too! In this lesson, you will learn: @@ -27,29 +32,39 @@ In this lesson, however, you will *not* learn about data analysis with spreadshe Much of your time as a researcher will be spent in the initial 'data wrangling' stage, where you need to organize the data to perform a proper analysis later. It's not the most fun, but it is necessary. In this lesson you will -learn how to think about data organization and some practices for more +learn how to think about data organization and some practices for more effective data wrangling. With this approach you can better format current data and plan new data collection so less data wrangling is needed. +:::::::::::::::::::::::::::::::::::::::::: prereq + +## Getting Started + +Data Carpentry's teaching is hands-on, so participants are encouraged to use +their own computers to ensure the proper setup of tools for an efficient +workflow.
**These lessons assume no prior knowledge of the skills or tools.** + +To get started, follow the directions in the "[Setup](learners/setup.md)" tab to +download data to your computer and follow any installation instructions. + +#### Prerequisites + +This lesson requires a working copy of spreadsheet software, such as Microsoft +Excel or LibreOffice or OpenOffice.org (see more details in "[Setup](learners/setup.md)"). +
To most effectively use these materials, please make sure to install +everything *before* working through this lesson. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::: prereq + +## For Instructors + +If you are teaching this lesson in a workshop, please see the +[Instructor notes](instructors/instructor-notes.md). + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + -> ## Getting Started -> -> Data Carpentry's teaching is hands-on, so participants are encouraged to use -> their own computers to insure the proper setup of tools for an efficient -> workflow.
**These lessons assume no prior knowledge of the skills or tools.** -> -> To get started, follow the directions in the "[Setup]({{ relative_root_path }}/{% link setup.md %})" tab to -> download data to your computer and follow any installation instructions. -> -> #### Prerequisites -> -> This lesson requires a working copy of spreadsheet software, such as Microsoft -> Excel or LibreOffice or OpenOffice.org (see more details in "[Setup]({{ relative_root_path }}/{% link setup.md %})"). ->
To most effectively use these materials, please make sure to install -> everything *before* working through this lesson. -{: .prereq} - -> ## For Instructors -> If you are teaching this lesson in a workshop, please see the -> [Instructor notes](guide/). -{: .prereq} diff --git a/_extras/datamanagement.md b/instructors/datamanagement.md similarity index 65% rename from _extras/datamanagement.md rename to instructors/datamanagement.md index 3df95ef..01400ca 100644 --- a/_extras/datamanagement.md +++ b/instructors/datamanagement.md @@ -1,31 +1,33 @@ --- -layout: page title: Data Management Tips --- -[The following is distilled from a _Nature_ Toolbox article that was published on 1 April 2019: Perkel, J.M. 11 ways to avert a data-storage disaster. _Nature_ **568**, 131-132 (2019). [https://www.nature.com/articles/d41586-019-01040-w](https://www.nature.com/articles/d41586-019-01040-w)] + +[The following is distilled from a *Nature* Toolbox article that was published on 1 April 2019: Perkel, J.M. 11 ways to avert a data-storage disaster. *Nature* **568**, 131-132 (2019). [https://www.nature.com/articles/d41586-019-01040-w](https://www.nature.com/articles/d41586-019-01040-w)] **Tips for managing your data** 1. **3-2-1**: Fires, floods, and theft all happen -- not to mention bitrot (the deterioration of storage media over time). So, keep at least three copies of your data, on two different media, at least one of which is off-site. (And be sure to store your local copies under proper environmental conditions!) -2. **Talk to the experts**: Make your institution’s professionals your first call. Ask IT team colleagues about free or low-cost backup options available; your librarian about data management strategies; and your grant officers about regulations regarding how, and how long, to store data +2. **Talk to the experts**: Make your institution's professionals your first call. 
Ask IT team colleagues about free or low-cost backup options available; your librarian about data management strategies; and your grant officers about regulations regarding how, and how long, to store data -3. **Safeguard privacy**: Private data, such as student information, cannot be stored just anywhere. And if those data are lost or stolen, you could face consequences. If you have, or plan to store such data, speak to your institution’s IT team for advice. +3. **Safeguard privacy**: Private data, such as student information, cannot be stored just anywhere. And if those data are lost or stolen, you could face consequences. If you have, or plan to store such data, speak to your institution's IT team for advice. -4. **Manage your data**: Make your backups more effective, and more future-proof, by developing a data-management plan. Establish file-naming conventions and organizational strategies -- for instance, that each project gets its own directory, with dedicated subdirectories for data and code. Decide which data will be backed up, and which can be discarded. Determine where different data types will be backed up, and how often. And, document everything: Keep a copy of your data management plan where people can find it, and annotate your experiments with README files that indicate the experiment, the file structures, required applications or scripts, and so on. +4. **Manage your data**: Make your backups more effective, and more future-proof, by developing a data-management plan. Establish file-naming conventions and organizational strategies -- for instance, that each project gets its own directory, with dedicated subdirectories for data and code. Decide which data will be backed up, and which can be discarded. Determine where different data types will be backed up, and how often. 
And, document everything: Keep a copy of your data management plan where people can find it, and annotate your experiments with README files that indicate the experiment, the file structures, required applications or scripts, and so on. -5. **Share with others**: Your colleagues may also need access to your data. Can someone else understand what the data are and where they’re located if you’re not around? Make sure they have access and permissions to the data and that they are able to understand what the files are and how they are organized (see **Manage your data**). +5. **Share with others**: Your colleagues may also need access to your data. Can someone else understand what the data are and where they're located if you're not around? Make sure they have access and permissions to the data and that they are able to understand what the files are and how they are organized (see **Manage your data**). 6. **Be realistic**: Once you develop a backup strategy, discuss it in the lab. Is it accessible to new colleagues, or only command-line experts? Is it doable after pulling an all-nighter? And how effective is it? Simulate what would happen if disaster should strike: What data will you lose, and what can you recover? -7. **Automate**: The thing about backups is, you need them when you least expect it. So don’t rely on remembering to run a backup -- automate them. +7. **Automate**: The thing about backups is, you need them when you least expect it. So don't rely on remembering to run a backup -- automate them. -8. **Test your backup**: Do your backups actually work? Test them to find out. Make sure you can actually open key files, and that you have the required applications to read them. Use checksums to ensure data integrity. And test the files on a different system if possible, as, if your primary computer should fail, you won’t have access to its contents. +8. **Test your backup**: Do your backups actually work? Test them to find out. 
Make sure you can actually open key files, and that you have the required applications to read them. Use checksums to ensure data integrity. And test the files on a different system if possible, as, if your primary computer should fail, you won't have access to its contents. -9. **Protect your raw data**: Always backup your raw data. And keep it safe: Duplicate the data before working with it, and open it read-only. An errant file-open command in write mode (e.g., in Python: `f = open (filename, ‘w’)`) instead of read (`‘r’`) is all it takes to wipe out a file for good. +9. **Protect your raw data**: Always back up your raw data. And keep it safe: Duplicate the data before working with it, and open it read-only. An errant file-open command in write mode (e.g., in Python: `f = open (filename, 'w')`) instead of read (`'r'`) is all it takes to wipe out a file for good. 10. **Keep one backup offline**: An always-on backup system is a recipe for disaster. If your computer is hacked, or suffers a power-spike, the backup system can also be compromised. So keep at least one backup offline, just in case. -11. **Plan ahead**: Storage media don’t last forever, so periodically review your backup strategy to make sure it’s current. Do you still have devices that can read the backups? Is it time to migrate to a newer platform? And don’t neglect your cloud storage: companies can shift priorities, or you can lose your passwords. So double-up on those too, just in case. +11. **Plan ahead**: Storage media don't last forever, so periodically review your backup strategy to make sure it's current. Do you still have devices that can read the backups? Is it time to migrate to a newer platform? And don't neglect your cloud storage: companies can shift priorities, or you can lose your passwords. So double-up on those too, just in case. + +12. **Expect the unexpected**: Make periodic disaster assessments to safeguard your hardware.
If you live in a flood zone, the basement may not be the best place to store a server. If you live in an earthquake-prone area, you might anchor your computers so they don't fall over. And if your computers are located near fire-control sprinklers, you might raise them off the ground, or shield them from potential leaks. + -12. **Expect the unexpected**: Make periodic disaster assessments to safeguard your hardware. If you live in a flood zone, the basement may not be the best place to store a server. If you live in an earthquake-prone area, you might anchor your computers so they don’t fall over. And if your computers are located near fire-control sprinklers, you might raise them off the ground, or shield them from potential leaks. diff --git a/instructors/instructor-notes.md b/instructors/instructor-notes.md new file mode 100644 index 0000000..c596b42 --- /dev/null +++ b/instructors/instructor-notes.md @@ -0,0 +1,115 @@ +--- +title: Instructor Notes +--- + +## Instructor notes + +## Lesson motivation and learning objectives + +The purpose of this lesson is *not* to teach how to do data analysis in spreadsheets, +but to teach good data organization and how to do some data cleaning and +quality control in a spreadsheet program. + +## Lesson design + +#### [Introduction](../episodes/00-intro.md) + +- Introduce that we're teaching data organization, and that we're using + spreadsheets, because most people do data entry in spreadsheets or + have data in spreadsheets. +- Emphasize that we are teaching good practice in data organization and that + this is the foundation of their research practice. Without organized and clean + data, it will be difficult for them to apply the things we're teaching in the + rest of the workshop to their data. +- Much of their lives as a researcher will be spent on this 'data wrangling' stage, but + some of it can be prevented with good strategies for data collection up front. 
+- Tell that we're not teaching data analysis or plotting in spreadsheets, because it's + very manual and also not reproducible. That's why we're teaching SQL, R, Python! +- Now let's talk about spreadsheets, and when we say spreadsheets, we mean any program that + does spreadsheets like Excel, LibreOffice, OpenOffice. Most learners are probably using Excel. +- Ask the audience any things they've accidentally done in spreadsheets. Talk about an example of your own, like that you accidentally sorted only a single column and not the rest + of the data in the spreadsheet. What are the pain points!? +- As people answer highlight some of these issues with spreadsheets + +#### [Formatting data](../episodes/01-format-data.md) + +- Go through the point about keeping track of your steps and keeping raw data raw +- Go through the cardinal rule of spreadsheets about columns, rows and cells +- Hand them a messy data file and have them pair up and work together to clean up the data. + *Give them 15 minutes to do this.* +- Ask for what people did to clean the data. As they bring up different points you can + refer to them in the 02-common-mistakes.md file, or expand a bit on the point they brought up. + If you are just teaching the lesson, it would be good to familiarize yourself with + the set of mistakes in 02-common-mistakes. All these mistakes are present in the messy + dataset. +- If you get a response where they've fixed the date, you can pause and go to the + 03-dates-as-data.md lesson. Or you can say you'll come back to dates at the end. + There's an exercise in that file about how to change the + date into three columns using Excel's built in MONTH, DAY, YEAR functions. Have them + run through that exercise. + +#### [Common formatting problems](../episodes/02-common-mistakes.md) + +- **Don't go through this chapter** except to refer to as responses to the exercise in + the previous chapter. 
+ +#### [Dates as data](../episodes/03-dates-as-data.md) + +- Do the exercise and make the point about dates either in response to a learner bringing + up date as an issue during the responses, or at the end of the response time. + +#### [Quality control](../episodes/04-quality-control.md) + +*This lesson is optional* + +The challenge with this lesson is that the instructor's version of the spreadsheet software is going to look different than about half the room's. It makes +it challenging to show where you can find menu options and navigate through. + +Instead discuss the concepts of quality control, and how things like sorting can help you find outliers in your data. + +#### [Exporting data](../episodes/05-exporting-data.md) + +- Have the students export their cleaned data as CSV. Reiterate again the need for + data in this format for the other tools we'll be using. + +#### Concluding points + +- Now your data is organized so that a computer can read and understand it. This + lets you use the full power of the computer for your analyses as we'll see in the + rest of the workshop. +- While your data is now neatly organized, it still might have errors or missing data + or other problems. It's like you put all your data in the right drawers, but the + drawers might still be messy. The next lesson is going to teach you OpenRefine which + is great for data cleaning and for some of the quality control that we touched on + in this lesson. It also has the advantage that it automatically keeps track of the + steps you take. + +## Technical tips and tricks + +Provide information on setting up your environment for learners to view your +live coding (increasing text size, changing text color, etc), as well as +general recommendations for working with coding tools to best suit the +learning environment.
+ +## Common problems + +#### Excel looks and acts different on different operating systems + +The main challenge with this lesson is that Excel looks very different and how you +do things is even different between Mac and PC, and between different versions of +Excel. So, the presenter's environment will only be the same as some of the learners. + +We need better notes and screenshots of how things work on both Mac and PC. But we +likely won't be able to cover all the different versions of Excel. + +If you have a helper who has experience with the other OS than you, it would be good +to prep them to help with this lesson and tell people how to do things in the other OS. + +#### People are not interactive or responsive on the Exercise + +This lesson depends on people working on the exercise and responding with things +that are fixed. If your audience is reluctant to participate, start out with +some things on your own, or ask a helper for their answers. This generally gets +even a reluctant audience started. + + diff --git a/_extras/discuss.md b/learners/discuss.md similarity index 76% rename from _extras/discuss.md rename to learners/discuss.md index b169e94..983d483 100644 --- a/_extras/discuss.md +++ b/learners/discuss.md @@ -1,36 +1,39 @@ --- -layout: page title: Discussion --- ### Dealing with commas as part of data values in `csv` files -When talking about [exporting data](../05-exporting-data/) we discussed how to export Excel file formats into `csv`. Comma Separated Value files are indeed very useful and allow for easily exchanging and sharing data. +When talking about [exporting data](../episodes/05-exporting-data.md) we discussed how to export Excel file formats into `csv`. Comma Separated Value files are indeed very useful and allow for easily exchanging and sharing data. However, there are some significant problems with this particular format. Quite often the data values themselves may include commas (,).
In that case, the software which you use (including Excel) will most likely incorrectly display the data in columns. This is because the commas which are a part of the data values will be interpreted as delimiters. For example, our data might look like this: - - species_id,genus,species,taxa - AB,Amphispiza,bilineata,Bird - AH,Ammospermophilus,harrisi,Rodent-not,censused - AS,Ammodramus,savannarum,Bird - BA,Baiomys,taylori,Rodent - -In the record `AH,Ammospermophilus,harrisi,Rodent-not,censused` the value for `taxa` includes a comma (`Rodent-not,censused`). + +``` + species_id,genus,species,taxa + AB,Amphispiza,bilineata,Bird + AH,Ammospermophilus,harrisi,Rodent-not,censused + AS,Ammodramus,savannarum,Bird + BA,Baiomys,taylori,Rodent +``` + +In the record `AH,Ammospermophilus,harrisi,Rodent-not,censused` the value for `taxa` includes a comma (`Rodent-not,censused`). If we try to read the above into Excel (or other spreadsheet program), we will get something like this: -![Issue with importing csv format](../fig/csv-mistake.png) +![](fig/csv-mistake.png){alt='Issue with importing csv format'} -The value for `taxa` was split into two columns (instead of being put in one column `D`). This can propagate to a number of further errors. For example, the extra column will be interpreted as a column with many missing values (and without a proper header). In addition to that, the value in column `D` for the record in row 3 (so the one where the value for 'taxa' contained the comma) is now incorrect. +The value for `taxa` was split into two columns (instead of being put in one column `D`). This can propagate to a number of further errors. For example, the extra column will be interpreted as a column with many missing values (and without a proper header). In addition to that, the value in column `D` for the record in row 3 (so the one where the value for 'taxa' contained the comma) is now incorrect. 
If you want to store your data in `csv` format and expect that your data values may contain commas, you can avoid the problem discussed above by putting the values in quotes (""). Applying this rule, our data might look like this: - species_id,genus,species,taxa - "AB","Amphispiza","bilineata","Bird" - "AH","Ammospermophilus","harrisi","Rodent-not, censused" - "AS","Ammodramus","savannarum","Bird" - "BA","Baiomys","taylori","Rodent" +``` +species_id,genus,species,taxa +"AB","Amphispiza","bilineata","Bird" +"AH","Ammospermophilus","harrisi","Rodent-not, censused" +"AS","Ammodramus","savannarum","Bird" +"BA","Baiomys","taylori","Rodent" +``` Now opening this file as a `csv` in Excel will not lead to an extra column, because Excel will only use commas that fall outside of quotation marks as delimiting characters. However, if you are working with an already existing dataset in which the data values are not included in "" but which have commas as both delimiters and parts of data values, you are potentially facing a major problem with data cleaning. @@ -38,3 +41,4 @@ If the dataset you're dealing with contains hundreds or thousands of records, cl Cleaning up datasets is one of the major problems in many scientific disciplines. The approach almost always depends on the particular context. However, it is a good practice to clean the data in an automated fashion, for example by writing and running a script. The Python and R lessons will give you the basis for developing skills to build relevant scripts. 
+ diff --git a/reference.md b/learners/reference.md similarity index 88% rename from reference.md rename to learners/reference.md index 2377355..9973df6 100644 --- a/reference.md +++ b/learners/reference.md @@ -1,15 +1,15 @@ --- -layout: reference +title: 'Glossary' --- ## Glossary -{:auto_ids} +{:auto\_ids} cleaned data : data that has been manipulated post-collection to remove errors or inaccuracies, introduce desired formatting changes, or otherwise prepare the data for analysis conditional formatting -: formatting that is applied to a specific cell or range of cells depending on a set of criteria +: formatting that is applied to a specific cell or range of cells depending on a set of criteria CSV (comma separated values) format : a plain text file format in which values are separated by commas @@ -18,7 +18,7 @@ factor : a variable that takes on a limited number of possible values (i.e. categorical data) metadata -: data which describes other data +: data which describes other data null value : a value used to record observations missing from a dataset @@ -30,7 +30,7 @@ plain text : unformatted text quality assurance -: any process which checks data for validity during entry +: any process which checks data for validity during entry quality control : any process which removes problematic data from a dataset @@ -49,3 +49,5 @@ TSV (tab separated values) format variable : a category of data being collected on the object being recorded (e.g. a mouse's weight) + + diff --git a/learners/setup.md b/learners/setup.md new file mode 100644 index 0000000..2502d2c --- /dev/null +++ b/learners/setup.md @@ -0,0 +1,58 @@ +--- +title: Setup +--- + +:::::::::::::::::::::::::::::::::::::::::: prereq + +## Data + +**Download** this data file to your computer: [https://ndownloader.figshare.com/files/2252083](https://ndownloader.figshare.com/files/2252083) + +#### About the data + +The data for this lesson is a part of the Data Carpentry Ecology workshop. 
+It is a teaching version of the Portal Database. The data in this lesson +is a subset of the teaching version that has been intentionally 'messed up' +for this lesson. + +The data for this lesson and the workshop are in the +[Portal Project Teaching Database](https://figshare.com/articles/Portal_Project_Teaching_Database/1314459) +available on FigShare, with a CC-BY license +available for reuse. + +> Ernest, M., Brown, J., Valone, T., and White, E.P. (2017). Portal Project Teaching Database. Version 6. Figshare. [DOI: 10.6084/m9.figshare.1314459.v6](https://figshare.com/articles/Portal_Project_Teaching_Database/1314459) + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::: prereq + +## Software + +To interact with spreadsheets, we can use LibreOffice, Microsoft Excel, Gnumeric, OpenOffice.org, or other programs. Commands may differ a bit between programs, but the general ideas for thinking about spreadsheets are the same. + +For this lesson, if you don't have a spreadsheet program already, you can use LibreOffice. It's a free, open source spreadsheet program. + +#### Windows + +- Download the Installer + - Install LibreOffice by going to [the installation page](https://www.libreoffice.org/download/libreoffice-fresh/). The version for Windows should automatically be selected. Click Download Version X.X.X (whichever is the most recent version). You will go to a page that asks about a donation, but you don't need to make one. Your download should begin automatically. +- Install LibreOffice +- Once the installer is downloaded, double click on it and LibreOffice should install. + +#### Mac OS X + +- Download the Installer + - Install LibreOffice by going to [the installation page](https://www.libreoffice.org/download/libreoffice-fresh/). The version for Mac should automatically be selected. Click Download Version X.X.X (whichever is the most recent version). 
You will go to a page that asks about a donation, but you don't need to make one. Your download should begin automatically. +- Install LibreOffice +- Once the installer is downloaded, double click on it and LibreOffice should install. + +#### Linux + +- Download the Installer + - Install LibreOffice by going to [the installation page](https://www.libreoffice.org/download/libreoffice-fresh/). The version for Linux should automatically be selected. Click Download Version X.X.X (whichever is the most recent version). You will go to a page that asks about a donation, but you don't need to make one. Your download should begin automatically. +- Install LibreOffice +- Once the installer is downloaded, double click on it and LibreOffice should install. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/profiles/learner-profiles.md b/profiles/learner-profiles.md new file mode 100644 index 0000000..434e335 --- /dev/null +++ b/profiles/learner-profiles.md @@ -0,0 +1,5 @@ +--- +title: FIXME +--- + +This is a placeholder file. Please add content here. diff --git a/setup.md b/setup.md deleted file mode 100644 index 89b4026..0000000 --- a/setup.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -layout: page -title: Setup ---- - -> ## Data -> -> **Download** this data file to your computer: [https://ndownloader.figshare.com/files/2252083](https://ndownloader.figshare.com/files/2252083) -> -> #### About the data -> The data for this lesson is a part of the Data Carpentry Ecology workshop. -> It is a teaching version of the Portal Database. The data in this lesson -> is a subset of the teaching version that has been intentionally 'messed up' -> for this lesson. -> -> The data for this lesson and the workshop are in the -> [Portal Project Teaching Database](https://figshare.com/articles/Portal_Project_Teaching_Database/1314459) -> available on FigShare, with a CC-BY license -> available for reuse. -> -> > Ernest, M., Brown, J., Valone, T., and White, E.P. (2017). 
Portal Project Teaching Database. Version 6. Figshare. [DOI: 10.6084/m9.figshare.1314459.v6](https://figshare.com/articles/Portal_Project_Teaching_Database/1314459) -> -{: .prereq} - -> ## Software -> -> To interact with spreadsheets, we can use LibreOffice, Microsoft Excel, Gnumeric, OpenOffice.org, or other programs. Commands may differ a bit between programs, but the general ideas for thinking about spreadsheets are the same. -> -> For this lesson, if you don't have a spreadsheet program already, you can use LibreOffice. It's a free, open source spreadsheet program. -> -> #### Windows -> -> - Download the Installer -> - Install LibreOffice by going to [the installation page](https://www.libreoffice.org/download/libreoffice-fresh/). The version for Windows should automatically be selected. Click Download Version X.X.X (whichever is the most recent version). You will go to a page that asks about a donation, but you don't need to make one. Your download should begin automatically. -> - Install LibreOffice -> - Once the installer is downloaded, double click on it and LibreOffice should install. -> -> #### Mac OS X -> -> - Download the Installer -> - Install LibreOffice by going to [the installation page](https://www.libreoffice.org/download/libreoffice-fresh/). The version for Mac should automatically be selected. Click Download Version X.X.X (whichever is the most recent version). You will go to a page that asks about a donation, but you don't need to make one. Your download should begin automatically. -> - Install LibreOffice -> - Once the installer is downloaded, double click on it and LibreOffice should install. -> -> -> #### Linux -> -> - Download the Installer -> - Install LibreOffice by going to [the installation page](https://www.libreoffice.org/download/libreoffice-fresh/). The version for Linux should automatically be selected. Click Download Version X.X.X (whichever is the most recent version). 
You will go to a page that asks about a donation, but you don't need to make one. Your download should begin automatically. -> - Install LibreOffice -> - Once the installer is downloaded, double click on it and LibreOffice should install. -> -{: .prereq} diff --git a/site/README.md b/site/README.md new file mode 100644 index 0000000..42997e3 --- /dev/null +++ b/site/README.md @@ -0,0 +1,2 @@ +This directory contains rendered lesson materials. Please do not edit files +here.