diff --git a/DESCRIPTION b/DESCRIPTION index 3372ff1..57d1e8a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: dfeR Title: Common DfE R tasks -Version: 0.6.1 +Version: 0.6.1.9000 Authors@R: c( person("Cam", "Race", , "cameron.race@education.gov.uk", role = c("aut", "cre")), person("Laura", "Selby", , "laura.selby@education.gov.uk", role = "aut"), diff --git a/NAMESPACE b/NAMESPACE index 472d3c3..8adec1a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,6 +26,7 @@ export(pretty_num_table) export(pretty_time_taken) export(round_five_up) export(toggle_message) +export(z_replace) import(renv, except = run) importFrom(emoji,emoji) importFrom(lifecycle,deprecated) diff --git a/NEWS.md b/NEWS.md index dca7aaa..39dd128 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# dfeR (development version) + +Added lookup data geog_time_identifiers +Added z_replace() to replace NA values in tables except for ones in geography and time columns that match ones in geog_time_identifiers. + # dfeR 0.6.1 Patch to update the pretty_num() function so that the `dp` argument's default is 0. diff --git a/R/create_project.R b/R/create_project.R index 96143b6..722741f 100644 --- a/R/create_project.R +++ b/R/create_project.R @@ -175,79 +175,7 @@ create_project <- function( # Create the readme ----- - readme_content <- c( - "# Readme", - "This is the template for a standard data analysis.", - "Please give an overview what you do in this project and how to ", - "navigate it.", - "", - "## Introduction", - "TODO: Give a short introduction of your project.", - "Let this section explain the objectives or the motivation behind ", - "this project.", - "", - "## Getting Started", - "TODO: Guide users through getting your code up and running on their ", - "own system. ", - "In this section you can talk about:", - "1. Installation process", - "2. Software dependencies", - "3. Latest releases", - "4. 
API references", - "", - "# Build and Test", - "TODO: Describe and show how to build your code and run the tests.", - "", - "# Contribute", - "TODO: Explain how other users and developers can contribute to make ", - "your code better.", - "", - "## Git integration", - "If you want to use git with your project (you should!), ", - "please do the following steps (replace `` with ", - "the actual name):", - "", - "1. Go to your git repository provider (GitHub/Azure DevOps) and create ", - "a new repository", - "2. DON'T check 'Add a README file'", - "3. Go to the Terminal within RStudio and type the following commands ", - "(for the URL, e.g. https://github.com):", - "", - "```bash", - "git init", - "git branch -M main", - "git remote add origin /.git", - "```", - "", - "4. Restart RStudio", - "5. Type in the R terminal `bash git add .` to add all files to ", - "the commit", - "6. Type in the R terminal `bash git commit -m ", - "'Your commit message (initial commit)'` to commit those files with ", - "a message.", - "7. In the terminal, execute the following command:", - "", - "```bash", - "git push -u origin main", - "```", - "", - "8. For the following commits, repeat this process", - "", - "NOTE: For sharing content on GitHub you should have ticked the ", - "'Create a .gitignore file for GitHub' checkbox when creating the project.", - "This will give create a strict .gitignore which is suitable for sharing ", - "code to the public.", - "Please also review to ensure no sensitive information is shared.", - "", - "For more information about the integration of git and RStudio, ", - "check out https://happygitwithr.com." 
-  )
-
-  # Write to README.md
-  readme_concat <- paste0(readme_content, collapse = "\n")
-  writeLines(readme_concat, con = file.path(path, "README.md"))
-
+  file.copy("README_template.md", file.path(path, "README.md"))
 
   # .renvignore
   file.create(paste0(path, "/.renvignore"))
diff --git a/R/datasets_documentation.R b/R/datasets_documentation.R
index fa17502..cffb579 100644
--- a/R/datasets_documentation.R
+++ b/R/datasets_documentation.R
@@ -104,3 +104,15 @@
 #' from
 #' https://geoportal.statistics.gov.uk/search?q=NAC_RGN
 "regions"
+
+#' Potential names for geography and time columns
+#'
+#' Potential names for geography and time columns in line with the ones used for
+#' the explore education statistics data screener.
+#'
+#'
+#' @format ## `geog_time_identifiers`
+#' A character vector with 38 potential column names in snake case format.
+#' @source curated by explore.statistics@@education.gov.uk.
+#' \href{https://shorturl.at/j4532}{Get guidance on time and geography data.}
+"geog_time_identifiers"
diff --git a/R/z_replace.R b/R/z_replace.R
new file mode 100644
index 0000000..d595f1c
--- /dev/null
+++ b/R/z_replace.R
@@ -0,0 +1,122 @@
+#' Replaces `NA` values in tables
+#'
+#' @description
+#' Replaces `NA` values in tables except for ones in time and geography
+#' columns that must be included in DfE official statistics.
+#' \href{https://shorturl.at/chy76}{Get more guidance on Open Data Standards.}
+#'
+#' @details
+#' Names of geography and time columns that are used in this function can be
+#' found in `dfeR::geog_time_identifiers`.
+#'
+#' @param data name of the table that you want to replace NA values in
+#' @param replacement_alt optional - if you want the NA replacement
+#' value to be different to "z"
+#' @param exclude_columns optional - additional columns to exclude from
+#' NA replacement.
+#' Column names that match ones found in `dfeR::geog_time_identifiers`
+#' will always be excluded because any missing data for these columns
+#' need more explicit codes to explain why data is not available.
+#'
+#' @return table with "z" or an alternate replacement value instead of `NA`
+#' values for columns that are not for time or geography.
+#' @export
+#' @seealso [dfeR::geog_time_identifiers]
+#' @examples
+#' # Create a table for the example
+#'
+#' df <- data.frame(
+#'   time_period = c(2022, 2022, 2022),
+#'   time_identifier = c("Calendar year", "Calendar year", "Calendar year"),
+#'   geographic_level = c("National", "Regional", "Regional"),
+#'   country_code = c("E92000001", "E92000001", "E92000001"),
+#'   country_name = c("England", "England", "England"),
+#'   region_code = c(NA, "E12000001", "E12000002"),
+#'   region_name = c(NA, "North East", "North West"),
+#'   mystery_count = c(42, 25, NA)
+#' )
+#'
+#' z_replace(df)
+#'
+#' # Use a different replacement value
+#' z_replace(df, replacement_alt = "c")
+#'
+z_replace <- function(data,
+                      replacement_alt = NULL,
+                      exclude_columns = NULL) {
+  # Fail early on anything that is not a table, instead of letting the row
+  # count check below raise an unrelated error
+  if (!is.data.frame(data)) {
+    stop("data must be a data frame.")
+  }
+
+  # Check if the table has rows - if not, stop the process
+  if (nrow(data) < 1) {
+    stop("Table is empty or contains no rows.")
+  }
+
+  # load in potential column names
+  geog_time_identifiers <- dfeR::geog_time_identifiers
+
+  # Reduce a name to lower snake_case: every run of punctuation and/or
+  # whitespace becomes a single "_", so names that differ only in case or
+  # separators compare as equal
+  standardise_name <- function(col_name) {
+    gsub("[[:punct:][:space:]]+", "_", tolower(col_name))
+  }
+
+  data_col_names <- colnames(data)
+
+  # columns that are geography/time identifiers once standardised ...
+  is_identifier <- standardise_name(data_col_names) %in%
+    standardise_name(geog_time_identifiers)
+  # ... and columns that are spelled exactly as in the reference list
+  is_exact_match <- data_col_names %in% geog_time_identifiers
+
+  # an identifier that is not an exact match would silently lose its
+  # protection from NA replacement below, so refuse to continue
+  if (any(is_identifier & !is_exact_match)) {
+    stop(
+      "Your table has geography and/or time column(s) that are not ",
+      "in snake_case.\nPlease amend your column names to match the ",
+      "formatting to dfeR::geog_time_identifiers."
+    )
+  }
+
+  # check for alt NA replacement
+  # if no alt provided, use z
+  if (is.null(replacement_alt)) {
+    replacement_alt <- "z"
+  } else if (!is.character(replacement_alt)) {
+    # replacement_alt must be a character vector ...
+    stop(
+      "You provided a ", data.class(replacement_alt),
+      " input for replacement_alt.\n",
+      "Please replace it with a character vector."
+    )
+  } else if (length(replacement_alt) > 1) {
+    # ... holding a single value
+    stop(
+      "You provided multiple values for replacement_alt.\n",
+      "Please, only provide a single value."
+    )
+  }
+
+  # Columns whose NA values are kept: every geography/time identifier plus
+  # any extra columns requested by the caller (c() drops a NULL
+  # exclude_columns, so no separate branch is needed)
+  protected_columns <- c(geog_time_identifiers, exclude_columns)
+
+  # All other columns are converted to character so they can hold the
+  # replacement value, then their NA values are replaced
+  dplyr::mutate(
+    data,
+    dplyr::across(
+      -tidyselect::any_of(protected_columns),
+      function(column) {
+        column <- as.character(column)
+        dplyr::if_else(is.na(column), replacement_alt, column)
+      }
+    )
+  )
+}
diff --git a/README_template.md b/README_template.md
new file mode 100644
index 0000000..9329906
--- /dev/null
+++ b/README_template.md
@@ -0,0 +1,32 @@
+# Readme template
+
+*This is the README template for a publication project. Please update the italicised text in each section below with the specific details for your publication. You can find more information on why you should have a README file on the* [RAP for statistics](https://dfe-analytical-services.github.io/analysts-guide/RAP/rap-statistics.html) *page of the Analysts' Guide.*
+
+*Once you have added your information, ensure all italicised text is deleted.*
+
+## Introduction
+
+- Purpose: *briefly explain the purpose of the code.*
+- Overview: *Provide a high-level summary of the contents and structure of the repository.*
+
+## Requirements
+
+- Access: *Detail any permissions or access needed to use the repository at the top of this section, e.g. access to specific SQL databases.
This is crucial for enabling new users to use the repository.* +- Skills/knowledge: *Outline the required skills or knowledge, such as familiarity with specific packages in R, or SQL.* +- Version control/Renv: *State how version control is managed and whether Renv is being used.* + +## Getting started + +- Setup instructions: *Provide step-by-step instructions on how to set up the environment, including installing dependencies.* +- Data input/output: *Describe the expected input data and where it can be found, as well as what output should be expected from the code.* + +## How to run and update + +- Running the code: *Explain how users can best run the code, for example by running a run all script.* +- Updating guidelines: *Outline the process for updating and contributing to the repository, including specific scripts and lines where updates are frequently needed. Describe how to get changes reviewed.* +- Issue reporting: *Explain how to report issues or suggest improvements. This could be through issues if using GitHub, boards in Azure DevOps or by emailing the team.* + +## Contact details + +- Main contacts: *List the names and contact information of people who maintain the repository.* +- Support channels: *Provide any information on how to get support, such as email addresses or teams channels.* diff --git a/_pkgdown.yml b/_pkgdown.yml index d1831d1..1e5dd7e 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -14,6 +14,7 @@ reference: - wd_pcon_lad_la_rgn_ctry - countries - regions + - geog_time_identifiers - title: Database connection desc: Helpful functions for connecting to databases in DfE @@ -59,3 +60,8 @@ reference: - comma_sep - get_ons_api_data - toggle_message + +- title: Replace NA values + desc: Replace NA values with the default "z" or an alternative replacement + contents: + - z_replace diff --git a/data-raw/geog_time_identifiers.R b/data-raw/geog_time_identifiers.R new file mode 100644 index 0000000..97cd663 --- /dev/null +++ 
b/data-raw/geog_time_identifiers.R @@ -0,0 +1,22 @@ +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Get a list of potential location and time columns +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +# create a vector of possible time and geography column names +geog_time_identifiers <- c( + "geographic_level", "country_code", "region_code", "new_la_code", "lad_code", + "pcon_code", "lsip_code", "local_enterprise_partnership_code", + "english_devolved_area_code", "opportunity_area_code", "ward_code", + "trust_id", "sponsor_id", "school_urn", "provider_ukprn", "institution_id", + "planning_area_code", "country_name", "region_name", "la_name", "lad_name", + "rsc_region_lead_name", "pcon_name", "lsip_name", + "local_enterprise_partnership_name", "english_devolved_area_name", + "opportunity_area_name", "ward_name", "trust_name", "sponsor_name", + "school_name", "provider_name", "institution_name", "planning_area_name", + "old_la_code", "school_laestab", "time_period", "time_identifier" +) + +# write it out to the data folder + +usethis::use_data(geog_time_identifiers, overwrite = TRUE) diff --git a/data/geog_time_identifiers.rda b/data/geog_time_identifiers.rda new file mode 100644 index 0000000..e656c2d Binary files /dev/null and b/data/geog_time_identifiers.rda differ diff --git a/inst/WORDLIST b/inst/WORDLIST index 75f04d2..4fcc7fa 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -43,6 +43,7 @@ rgn sep ser shorthands +shorturl sql uk utla diff --git a/man/geog_time_identifiers.Rd b/man/geog_time_identifiers.Rd new file mode 100644 index 0000000..f5a0da6 --- /dev/null +++ b/man/geog_time_identifiers.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/datasets_documentation.R +\docType{data} +\name{geog_time_identifiers} +\alias{geog_time_identifiers} +\title{Potential names for geography and time columns} +\format{ 
+\subsection{\code{geog_time_identifiers}}{ + +A character vector with 38 potential column names in snake case format. +} +} +\source{ +curated by explore.statistics@education.gov.uk. +\href{https://shorturl.at/j4532}{Get guidance on time and geography data.} +} +\usage{ +geog_time_identifiers +} +\description{ +Potential names for geography and time columns in line with the ones used for +the explore education statistics data screener. +} +\keyword{datasets} diff --git a/man/z_replace.Rd b/man/z_replace.Rd new file mode 100644 index 0000000..687ebf5 --- /dev/null +++ b/man/z_replace.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/z_replace.R +\name{z_replace} +\alias{z_replace} +\title{Replaces \code{NA} values in tables} +\usage{ +z_replace(data, replacement_alt = NULL, exclude_columns = NULL) +} +\arguments{ +\item{data}{name of the table that you want to replace NA values in} + +\item{replacement_alt}{optional - if you want the NA replacement +value to be different to "z"} + +\item{exclude_columns}{optional - additional columns to exclude from +NA replacement. +Column names that match ones found in \code{dfeR::geog_time_identifiers} +will always be excluded because any missing data for these columns +need more explicit codes to explain why data is not available.} +} +\value{ +table with "z" or an alternate replacement value instead of \code{NA} +values for columns that are not for time or geography. +} +\description{ +Replaces \code{NA} values in tables except for ones in time and geography +columns that must be included in DfE official statistics. +\href{https://shorturl.at/chy76}{Get more guidance on Open Data Standards.} +} +\details{ +Names of geography and time columns that are used in this function can be +found in \code{dfeR::geog_time_identifiers}. 
+}
+\examples{
+# Create a table for the example
+
+df <- data.frame(
+  time_period = c(2022, 2022, 2022),
+  time_identifier = c("Calendar year", "Calendar year", "Calendar year"),
+  geographic_level = c("National", "Regional", "Regional"),
+  country_code = c("E92000001", "E92000001", "E92000001"),
+  country_name = c("England", "England", "England"),
+  region_code = c(NA, "E12000001", "E12000002"),
+  region_name = c(NA, "North East", "North West"),
+  mystery_count = c(42, 25, NA)
+)
+
+z_replace(df)
+
+# Use a different replacement value
+z_replace(df, replacement_alt = "c")
+
+}
+\seealso{
+\link{geog_time_identifiers}
+}
diff --git a/tests/testthat/test-z_replace.R b/tests/testthat/test-z_replace.R
new file mode 100644
index 0000000..e8559c7
--- /dev/null
+++ b/tests/testthat/test-z_replace.R
@@ -0,0 +1,137 @@
+# Create a data frame for testing
+
+geog_df <- data.frame(
+  time_period = c(2022, 2022, 2022),
+  time_identifier = c("Calendar year", "Calendar year", "Calendar year"),
+  geographic_level = c("National", "Regional", "Regional"),
+  country_code = c("E92000001", "E92000001", "E92000001"),
+  country_name = c("England", "England", "England"),
+  region_code = c(NA, "E12000001", "E12000002"),
+  region_name = c(NA, "North East", "North West"),
+  mystery_count = c(42, 25, NA)
+)
+
+# Expected result for geog_df: geography and time columns keep their NA
+# values, while mystery_count becomes character with its NA replaced
+expected_geog_df <- function(replacement) {
+  expected <- geog_df
+  expected$mystery_count <- c("42", "25", replacement)
+  expected
+}
+
+test_that("z_replace outputs are as expected", {
+  # testing standard functionality
+  expect_equal(z_replace(geog_df), expected_geog_df("z"))
+
+  # testing alternative replacement
+  expect_equal(
+    z_replace(geog_df, replacement_alt = "x"),
+    expected_geog_df("x")
+  )
+  expect_equal(
+    z_replace(geog_df, replacement_alt = "c"),
+    expected_geog_df("c")
+  )
+})
+
+# check error messages for non-empty data frames
+
+test_that("Error messages are as expected in non-empty frames", {
+  # testing error for non character strings in replacement_alt
+  # (the expected text is passed as a fixed string so it is really matched)
+  expect_error(
+    z_replace(geog_df, replacement_alt = 1),
+    "You provided a numeric input for replacement_alt.",
+    fixed = TRUE
+  )
+
+  # testing error for multiple vectors in replacement_alt
+  expect_error(
+    z_replace(geog_df, replacement_alt = c("a", "z", "x")),
+    "You provided multiple values for replacement_alt.",
+    fixed = TRUE
+  )
+})
+
+# Create a table to test exclude_columns
+
+exclude_df <- data.frame(
+  a = c("1", "2", "3", "z"),
+  b = c("1", "2", "z", "4"),
+  county_name = c("county1", "county2", NA_character_, "county3"),
+  country_code = c("country1", NA_character_, "country2", "country3"),
+  time_period = c(2008, 2023, 2024, NA_real_)
+)
+
+test_that("exclude_columns works", {
+  # without including county_name in exclude_columns its NA is replaced
+  expected <- exclude_df
+  expected$county_name[3] <- "z"
+  expect_equal(z_replace(exclude_df), expected)
+
+  # including county_name in exclude_columns leaves the table unchanged
+  expect_equal(
+    z_replace(exclude_df, exclude_columns = "county_name"),
+    exclude_df
+  )
+})
+
+# Checking speed of the function
+
+# make this reproducible
+set.seed(123)
+# create table with randomly generated numbers
+speed_df <- data.frame(
+  a = sample(1:1000, 10000, replace = TRUE),
+  b = sample(1:1000, 10000, replace = TRUE),
+  c = sample(1:1000, 10000, replace = TRUE),
+  d = sample(1:1000, 10000, replace = TRUE),
+  e = sample(1:1000, 10000, replace = TRUE),
+  f = sample(1:1000, 10000, replace = TRUE),
+  g = sample(1:1000, 10000, replace = TRUE),
+  h = sample(1:1000, 10000, replace = TRUE),
+  i = sample(1:1000, 10000, replace = TRUE),
+  j = sample(1:1000, 10000, replace = TRUE),
+  school_urn = sample(1:1000, 10000, replace = TRUE)
+)
+
+# putting NAs in the table
+speed_df[speed_df < 300] <- NA
+
+# testing that the speed is less than 0.25 second
+test_that("Speed of the function", {
+  # wall-clock timings are too machine dependent to be checked on CRAN
+  skip_on_cran()
+
+  elapsed <- system.time(z_replace(speed_df))[["elapsed"]]
+  expect_lt(elapsed, 0.25)
+})
+
+# Check error message for empty data frame
+
+test_that("Error messages are as expected", {
+  empty_df <- data.frame()
+
+  expect_error(z_replace(empty_df), "Table is empty or contains no rows.")
+
+  expect_error(
+    z_replace(empty_df, replacement_alt = "x"),
+    "Table is empty or contains no rows."
+  )
+})
+
+# Check error messages for when tables contain geography and time columns
+# from the explore education statistics screener but different formatting
+
+test_that("Formatting of column names are checked", {
+  upper_case_df <- data.frame(
+    GEOGRAPHIC_LEVEL = c("level1", "level2", "level3", NA_character_),
+    time_period = c(2008, 2023, 2024, NA_real_)
+  )
+
+  expect_error(
+    z_replace(upper_case_df),
+    "geography and/or time column(s) that are not in snake_case.",
+    fixed = TRUE
+  )
+})