-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'feature/sys-diagnostics' of https://github.com/dfe-analytical-services/dfeR into feature/sys-diagnostics
- Loading branch information
Showing
14 changed files
with
490 additions
and
74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
Type: Package | ||
Package: dfeR | ||
Title: Common DfE R tasks | ||
Version: 0.6.1 | ||
Version: 0.6.1.9000 | ||
Authors@R: c( | ||
person("Cam", "Race", , "[email protected]", role = c("aut", "cre")), | ||
person("Laura", "Selby", , "[email protected]", role = "aut"), | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
#' Replaces `NA` values in tables
#'
#' @description
#' Replaces `NA` values in tables except for ones in time and geography
#' columns that must be included in DfE official statistics.
#' \href{https://shorturl.at/chy76}{Get more guidance on Open Data Standards.}
#'
#' @details
#' Names of geography and time columns that are used in this function can be
#' found in `dfeR::geog_time_identifiers`.
#'
#' Every column that is not excluded is converted to character before the
#' replacement is made, so those columns are returned as character columns.
#'
#' The function stops with an error if the table has no rows, if a column
#' name matches a geography or time identifier apart from its case or
#' punctuation, or if `replacement_alt` is not a single character value.
#'
#' @param data name of the table that you want to replace NA values in
#' @param replacement_alt optional - if you want the NA replacement
#' value to be different to "z"
#' @param exclude_columns optional - additional columns to exclude from
#' NA replacement.
#' Column names that match ones found in `dfeR::geog_time_identifiers`
#' will always be excluded because any missing data for these columns
#' need more explicit codes to explain why data is not available.
#'
#' @return table with "z" or an alternate replacement value instead of `NA`
#' values for columns that are not for time or geography.
#' @export
#' @seealso [dfeR::geog_time_identifiers]
#' @examples
#' # Create a table for the example
#'
#' df <- data.frame(
#'   time_period = c(2022, 2022, 2022),
#'   time_identifier = c("Calendar year", "Calendar year", "Calendar year"),
#'   geographic_level = c("National", "Regional", "Regional"),
#'   country_code = c("E92000001", "E92000001", "E92000001"),
#'   country_name = c("England", "England", "England"),
#'   region_code = c(NA, "E12000001", "E12000002"),
#'   region_name = c(NA, "North East", "North West"),
#'   mystery_count = c(42, 25, NA)
#' )
#'
#' z_replace(df)
#'
#' # Use a different replacement value
#' z_replace(df, replacement_alt = "c")
#'
z_replace <- function(data,
                      replacement_alt = NULL,
                      exclude_columns = NULL) {
  # Inputs without a row dimension (vectors, lists, NULL) would otherwise
  # fail below with a cryptic "argument is of length zero" error
  if (is.null(nrow(data))) {
    stop("data must be a table, for example a data frame or tibble.")
  }

  # Check if the table has rows - if not, stop the process
  if (nrow(data) < 1) {
    stop("Table is empty or contains no rows.")
  }

  # load in potential column names
  geog_time_identifiers <- dfeR::geog_time_identifiers

  # Reduce a column name to a comparable form: punctuation becomes a space,
  # runs of spaces collapse to one, then lower case with "_" as separator
  standardise_names <- function(col_names) {
    col_names <- gsub("[[:punct:]]", " ", col_names)
    col_names <- gsub(" +", " ", col_names)
    gsub(" ", "_", tolower(col_names))
  }

  data_col_names <- colnames(data)

  # a column is an identifier if its standardised name matches a reference
  # name; it is correctly formatted only if the raw name matches exactly
  is_identifier <- standardise_names(data_col_names) %in%
    standardise_names(geog_time_identifiers)
  is_exact_match <- data_col_names %in% geog_time_identifiers

  if (any(is_identifier & !is_exact_match)) {
    stop(
      "Your table has geography and/or time column(s) that are not ",
      "in snake_case.\nPlease amend your column names to match the formatting ",
      "to dfeR::geog_time_identifiers."
    )
  }

  # check for alt NA replacement
  # if no alt provided, use z
  if (is.null(replacement_alt)) {
    replacement_alt <- "z"
  } else if (!is.character(replacement_alt)) {
    stop(
      "You provided a ", data.class(replacement_alt),
      " input for replacement_alt.\n",
      "Please replace it with a character vector."
    )
  } else if (length(replacement_alt) > 1) {
    stop(
      "You provided multiple values for replacement_alt.\n",
      "Please, only provide a single value."
    )
  }

  # columns that must keep their NA values; c() drops a NULL exclude_columns,
  # so no separate branch is needed when it is not supplied
  protected_columns <- c(geog_time_identifiers, exclude_columns)

  # convert every other column to character, then replace its NAs
  data <- dplyr::mutate(
    data,
    dplyr::across(
      -tidyselect::any_of(protected_columns),
      ~ {
        values <- as.character(.x)
        dplyr::if_else(is.na(values), replacement_alt, values)
      }
    )
  )

  data
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
# Readme template | ||
|
||
*This is the README template for a publication project. Please update the italicised text in each section below with the specific details for your publication. You can find more information on why you should have a README file on the* [RAP for statistics](https://dfe-analytical-services.github.io/analysts-guide/RAP/rap-statistics.html) *page of the Analysts' Guide.* | ||
|
||
*Once you have added your information, ensure all italicised text is deleted.* | ||
|
||
## Introduction | ||
|
||
- Purpose: *Briefly explain the purpose of the code.* | ||
- Overview: *Provide a high-level summary of the contents and structure of the repository.* | ||
|
||
## Requirements | ||
|
||
- Access: *Detail any permissions or access needed to use the repository at the top of this section, e.g. access to specific SQL databases. This is crucial for enabling new users to use the repository.* | ||
- Skills/knowledge: *Outline the required skills or knowledge, such as familiarity with specific packages in R, or SQL.* | ||
- Version control/renv: *State how version control is managed and whether renv is being used.* | ||
|
||
## Getting started | ||
|
||
- Setup instructions: *Provide step-by-step instructions on how to set up the environment, including installing dependencies.* | ||
- Data input/output: *Describe the expected input data and where it can be found, as well as what output should be expected from the code.* | ||
|
||
## How to run and update | ||
|
||
- Running the code: *Explain how users can best run the code, for example by running a run all script.* | ||
- Updating guidelines: *Outline the process for updating and contributing to the repository, including specific scripts and lines where updates are frequently needed. Describe how to get changes reviewed.* | ||
- Issue reporting: *Explain how to report issues or suggest improvements. This could be through issues if using GitHub, boards in Azure DevOps or by emailing the team.* | ||
|
||
## Contact details | ||
|
||
- Main contacts: *List the names and contact information of people who maintain the repository.* | ||
- Support channels: *Provide any information on how to get support, such as email addresses or teams channels.* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Get a list of potential location and time columns
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Character vector of the possible time and geography column names,
# listed one per line in a fixed order
geog_time_identifiers <- c(
  "geographic_level",
  "country_code",
  "region_code",
  "new_la_code",
  "lad_code",
  "pcon_code",
  "lsip_code",
  "local_enterprise_partnership_code",
  "english_devolved_area_code",
  "opportunity_area_code",
  "ward_code",
  "trust_id",
  "sponsor_id",
  "school_urn",
  "provider_ukprn",
  "institution_id",
  "planning_area_code",
  "country_name",
  "region_name",
  "la_name",
  "lad_name",
  "rsc_region_lead_name",
  "pcon_name",
  "lsip_name",
  "local_enterprise_partnership_name",
  "english_devolved_area_name",
  "opportunity_area_name",
  "ward_name",
  "trust_name",
  "sponsor_name",
  "school_name",
  "provider_name",
  "institution_name",
  "planning_area_name",
  "old_la_code",
  "school_laestab",
  "time_period",
  "time_identifier"
)

# Save the vector as package data, replacing any previously saved copy
usethis::use_data(geog_time_identifiers, overwrite = TRUE)
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,7 @@ rgn | |
sep | ||
ser | ||
shorthands | ||
shorturl | ||
sql | ||
uk | ||
utla | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.