Merge branch 'feature/sys-diagnostics' of https://github.com/dfe-anal…

…ytical-services/dfeR into feature/sys-diagnostics
dfe-analytical-services · Jan 3, 2025 · f9341d2 · f9341d2
2 parents 0f1557c + ea3f9c2
commit f9341d2
Show file tree

Hide file tree

Showing 14 changed files with 490 additions and 74 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: dfeR
 Title: Common DfE R tasks
-Version: 0.6.1
+Version: 0.6.1.9000
 Authors@R: c(
     person("Cam", "Race", , "[email protected]", role = c("aut", "cre")),
     person("Laura", "Selby", , "[email protected]", role = "aut"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -26,6 +26,7 @@ export(pretty_num_table)
 export(pretty_time_taken)
 export(round_five_up)
 export(toggle_message)
+export(z_replace)
 import(renv, except = run)
 importFrom(emoji,emoji)
 importFrom(lifecycle,deprecated)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# dfeR (development version)
+
+- Added lookup data `geog_time_identifiers`.
+- Added `z_replace()` to replace NA values in tables, except for those in geography and time columns whose names match the ones in `geog_time_identifiers`.
+
 # dfeR 0.6.1
 
 Patch to update the pretty_num() function so that the `dp` argument's default is 0. 

diff --git a/R/create_project.R b/R/create_project.R
@@ -175,79 +175,7 @@ create_project <- function(
 
 
   # Create the readme -----
-  readme_content <- c(
-    "# Readme",
-    "This is the template for a standard data analysis.",
-    "Please give an overview what you do in this project and how to ",
-    "navigate it.",
-    "",
-    "## Introduction",
-    "TODO: Give a short introduction of your project.",
-    "Let this section explain the objectives or the motivation behind ",
-    "this project.",
-    "",
-    "## Getting Started",
-    "TODO: Guide users through getting your code up and running on their ",
-    "own system. ",
-    "In this section you can talk about:",
-    "1.	Installation process",
-    "2.	Software dependencies",
-    "3.	Latest releases",
-    "4.	API references",
-    "",
-    "# Build and Test",
-    "TODO: Describe and show how to build your code and run the tests.",
-    "",
-    "# Contribute",
-    "TODO: Explain how other users and developers can contribute to make ",
-    "your code better.",
-    "",
-    "## Git integration",
-    "If you want to use git with your project (you should!), ",
-    "please do the following steps (replace `<name of your repository>` with ",
-    "the actual name):",
-    "",
-    "1.  Go to your git repository provider (GitHub/Azure DevOps) and create ",
-    "a new repository",
-    "2.  DON'T check 'Add a README file'",
-    "3.  Go to the Terminal within RStudio and type the following commands ",
-    "(for the URL, e.g. https://github.com):",
-    "",
-    "```bash",
-    "git init",
-    "git branch -M main",
-    "git remote add origin <URL of your GitHub/Azure DevOps instance>/<name ",
-    "of your repository>.git",
-    "```",
-    "",
-    "4.  Restart RStudio",
-    "5.  Type in the R terminal `bash git add .` to add all files to ",
-    "the commit",
-    "6.  Type in the R terminal `bash git commit -m ",
-    "'Your commit message (initial commit)'` to commit those files with ",
-    "a message.",
-    "7.  In the terminal, execute the following command:",
-    "",
-    "```bash",
-    "git push -u origin main",
-    "```",
-    "",
-    "8.  For the following commits, repeat this process",
-    "",
-    "NOTE: For sharing content on GitHub you should have ticked the ",
-    "'Create a .gitignore file for GitHub' checkbox when creating the project.",
-    "This will give create a strict .gitignore which is suitable for sharing ",
-    "code to the public.",
-    "Please also review to ensure no sensitive information is shared.",
-    "",
-    "For more information about the integration of git and RStudio, ",
-    "check out https://happygitwithr.com."
-  )
-
-  # Write to README.md
-  readme_concat <- paste0(readme_content, collapse = "\n")
-  writeLines(readme_concat, con = file.path(path, "README.md"))
-
+  file.copy("README_template.md", file.path(path, "README.md"))
 
   # .renvignore
   file.create(paste0(path, "/.renvignore"))

diff --git a/R/datasets_documentation.R b/R/datasets_documentation.R
@@ -104,3 +104,15 @@
 #' from
 #' https://geoportal.statistics.gov.uk/search?q=NAC_RGN
 "regions"
+
+#' Potential names for geography and time columns
+#'
+#' Potential names for geography and time columns in line with the ones used for
+#' the explore education statistics data screener.
+#'
+#'
+#' @format ## `geog_time_identifiers`
+#' A character vector with 38 potential column names in snake case format.
+#' @source curated by explore.statistics@@education.gov.uk.
+#' \href{https://shorturl.at/j4532}{Get guidance on time and geography data.}
+"geog_time_identifiers"
diff --git a/R/z_replace.R b/R/z_replace.R
@@ -0,0 +1,152 @@
+#' Replaces `NA` values in tables
+#'
+#' @description
+#' Replaces `NA` values in tables except for ones in time and geography
+#' columns that must be included in DfE official statistics.
+#' \href{https://shorturl.at/chy76}{Get more guidance on Open Data Standards.}
+#'
+#' @details
+#' Names of geography and time columns that are used in this function can be
+#' found in `dfeR::geog_time_identifiers`.
+#'
+#' Every column that is not excluded is converted to character before its
+#' `NA` values are replaced, so that the replacement value can sit alongside
+#' the original values.
+#'
+#' The function stops with an error if `data` is not a data frame, if it has
+#' no rows, if a geography or time column name is not written in snake_case,
+#' or if `replacement_alt` is not a single character value.
+#'
+#' @param data name of the table that you want to replace NA values in
+#' @param replacement_alt optional - a single character value to use if you
+#' want the NA replacement value to be different to "z"
+#' @param exclude_columns optional - additional columns to exclude from
+#' NA replacement.
+#' Column names that match ones found in `dfeR::geog_time_identifiers`
+#' will always be excluded because any missing data for these columns
+#' need more explicit codes to explain why data is not available.
+#'
+#' @return table with "z" or an alternate replacement value instead of `NA`
+#' values for columns that are not for time or geography.
+#' @export
+#' @seealso [dfeR::geog_time_identifiers]
+#' @examples
+#' # Create a table for the example
+#'
+#' df <- data.frame(
+#'   time_period = c(2022, 2022, 2022),
+#'   time_identifier = c("Calendar year", "Calendar year", "Calendar year"),
+#'   geographic_level = c("National", "Regional", "Regional"),
+#'   country_code = c("E92000001", "E92000001", "E92000001"),
+#'   country_name = c("England", "England", "England"),
+#'   region_code = c(NA, "E12000001", "E12000002"),
+#'   region_name = c(NA, "North East", "North West"),
+#'   mystery_count = c(42, 25, NA)
+#' )
+#'
+#' z_replace(df)
+#'
+#' # Use a different replacement value
+#' z_replace(df, replacement_alt = "c")
+#'
+z_replace <- function(data,
+                      replacement_alt = NULL,
+                      exclude_columns = NULL) {
+  # Fail early with a clear message when the input is not a table at all
+  if (!is.data.frame(data)) {
+    stop("data must be a data frame.")
+  }
+
+  # Check if the table has rows - if not, stop the process
+  if (nrow(data) < 1) {
+    stop("Table is empty or contains no rows.")
+  }
+
+  # load in potential column names
+  geog_time_identifiers <- dfeR::geog_time_identifiers
+
+  # Reduce a name to lower snake_case: any run of punctuation and/or spaces
+  # becomes a single underscore, so the same name written with different
+  # case or separators compares equal
+  standardise_names <- function(col_names) {
+    gsub("[[:punct:] ]+", "_", tolower(col_names))
+  }
+
+  data_col_names_og <- colnames(data)
+
+  # TRUE where a column is a geography/time column once formatting is ignored
+  col_name_exists <- standardise_names(data_col_names_og) %in%
+    standardise_names(geog_time_identifiers)
+  # TRUE where the column name is already written exactly as expected
+  formatting_test <- data_col_names_og %in% geog_time_identifiers
+
+  # A geography/time column that only matches after standardising is
+  # mis-formatted and would otherwise have its NAs replaced by mistake
+  if (any(col_name_exists & !formatting_test)) {
+    stop(
+      "Your table has geography and/or time column(s) that are not ",
+      "in snake_case.\nPlease amend your column names to match the formatting ",
+      "to dfeR::geog_time_identifiers."
+    )
+  }
+
+  # check for alt NA replacement
+  # if no alt, provided, use z
+  if (is.null(replacement_alt)) {
+    replacement_alt <- "z"
+  } else if (!is.character(replacement_alt)) {
+    stop(
+      "You provided a ", data.class(replacement_alt),
+      " input for replacement_alt.\n",
+      "Please replace it with a character vector."
+    )
+  } else if (length(replacement_alt) > 1) {
+    stop(
+      "You provided multiple values for replacement_alt.\n",
+      "Please, only provide a single value."
+    )
+  }
+
+  # Geography/time columns are always left alone; exclude_columns adds to
+  # that set (c() drops NULL, so no separate branch is needed)
+  protected_columns <- c(geog_time_identifiers, exclude_columns)
+
+  # Convert every other column to character, then swap NA for the replacement
+  dplyr::mutate(
+    data,
+    dplyr::across(
+      -tidyselect::any_of(protected_columns),
+      function(column) {
+        column <- as.character(column)
+        dplyr::if_else(is.na(column), replacement_alt, column)
+      }
+    )
+  )
+}
diff --git a/README_template.md b/README_template.md
@@ -0,0 +1,32 @@
+# Readme template
+
+*This is the README template for a publication project. Please update the italicised text in each section below with the specific details for your publication. You can find more information on why you should have a README file on the* [RAP for statistics](https://dfe-analytical-services.github.io/analysts-guide/RAP/rap-statistics.html) *page of the Analysts' Guide.*
+
+*Once you have added your information, ensure all italicised text is deleted.*
+
+## Introduction
+
+-   Purpose: *briefly explain the purpose of the code.*
+-   Overview: *Provide a high-level summary of the contents and structure of the repository.*
+
+## Requirements
+
+-   Access: *Detail any permissions or access needed to use the repository at the top of this section, e.g. access to specific SQL databases. This is crucial for enabling new users to use the repository.*
+-   Skills/knowledge: *Outline the required skills or knowledge, such as familiarity with specific packages in R, or SQL.*
+-   Version control/Renv: *State how version control is managed and whether Renv is being used.*
+
+## Getting started
+
+-   Setup instructions: *Provide step-by-step instructions on how to set up the environment, including installing dependencies.*
+-   Data input/output: *Describe the expected input data and where it can be found, as well as what output should be expected from the code.*
+
+## How to run and update
+
+-   Running the code: *Explain how users can best run the code, for example by running a run all script.*
+-   Updating guidelines: *Outline the process for updating and contributing to the repository, including specific scripts and lines where updates are frequently needed. Describe how to get changes reviewed.*
+-   Issue reporting: *Explain how to report issues or suggest improvements. This could be through issues if using GitHub, boards in Azure DevOps or by emailing the team.*
+
+## Contact details
+
+-   Main contacts: *List the names and contact information of people who maintain the repository.*
+-   Support channels: *Provide any information on how to get support, such as email addresses or teams channels.*
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -14,6 +14,7 @@ reference:
   - wd_pcon_lad_la_rgn_ctry
   - countries
   - regions
+  - geog_time_identifiers
 
 - title: Database connection
   desc: Helpful functions for connecting to databases in DfE
@@ -59,3 +60,8 @@ reference:
   - comma_sep
   - get_ons_api_data
   - toggle_message
+
+- title: Replace NA values
+  desc: Replace NA values with the default "z" or an alternative replacement
+  contents:
+  - z_replace
diff --git a/data-raw/geog_time_identifiers.R b/data-raw/geog_time_identifiers.R
@@ -0,0 +1,22 @@
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Build the geog_time_identifiers lookup of geography and time column names
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


+# Character vector of the snake_case geography and time column names
+geog_time_identifiers <- c(
+  "geographic_level", "country_code", "region_code", "new_la_code", "lad_code",
+  "pcon_code", "lsip_code", "local_enterprise_partnership_code",
+  "english_devolved_area_code", "opportunity_area_code", "ward_code",
+  "trust_id", "sponsor_id", "school_urn", "provider_ukprn", "institution_id",
+  "planning_area_code", "country_name", "region_name", "la_name", "lad_name",
+  "rsc_region_lead_name", "pcon_name", "lsip_name",
+  "local_enterprise_partnership_name", "english_devolved_area_name",
+  "opportunity_area_name", "ward_name", "trust_name", "sponsor_name",
+  "school_name", "provider_name", "institution_name", "planning_area_name",
+  "old_la_code", "school_laestab", "time_period", "time_identifier"
+)
+
+# Save the vector as package data, overwriting any existing copy

+usethis::use_data(geog_time_identifiers, overwrite = TRUE)
diff --git a/data/geog_time_identifiers.rda b/data/geog_time_identifiers.rda
diff --git a/inst/WORDLIST b/inst/WORDLIST
@@ -43,6 +43,7 @@ rgn
 sep
 ser
 shorthands
+shorturl
 sql
 uk
 utla

diff --git a/man/geog_time_identifiers.Rd b/man/geog_time_identifiers.Rd
-Original file line number
+Diff line change
@@ Expand Up / @@ -43,6 +43,7 @@ rgn @@
     sep
     ser
     shorthands
+    shorturl
     sql
     uk
     utla
@@ Expand Down @@