Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

127 compare header and schema #146

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions R/read_resource.R
Original file line number Diff line number Diff line change
Expand Up @@ -352,11 +352,11 @@ read_resource <- function(package, resource_name, col_select = NULL) {
# Read data directly
if (resource$read_from == "df") {
df <- dplyr::as_tibble(resource$data)

data_col_names <- colnames(df)
# Read data from data
} else if (resource$read_from == "data") {
df <- dplyr::as_tibble(do.call(rbind.data.frame, resource$data))

data_col_names <- colnames(df)
# Read data from path(s)
} else if (resource$read_from == "path" || resource$read_from == "url") {
dataframes <- list()
Expand Down Expand Up @@ -391,7 +391,41 @@ read_resource <- function(package, resource_name, col_select = NULL) {
}
# Merge data frames for all paths
df <- dplyr::bind_rows(dataframes)
# Read header from first file
data_col_names <-
readr::read_lines(file = paths[1], n_max = 1) %>%
I() %>%
readr::read_delim(delim = replace_null(dialect$delimiter, ","),
quote = replace_null(dialect$quoteChar, "\""),
escape_backslash = ifelse(
replace_null(dialect$escapeChar, "not set") == "\\", TRUE, FALSE
),
escape_double = ifelse(
# if escapeChar is set, set doubleQuote to FALSE (mutually exclusive)
replace_null(dialect$escapeChar, "not set") == "\\",
FALSE,
replace_null(dialect$doubleQuote, TRUE)
),
locale = locale,
na = replace_null(schema$missingValues, ""),
comment = replace_null(dialect$commentChar, ""),
trim_ws = replace_null(dialect$skipInitialSpace, FALSE),
col_names = FALSE,
show_col_types = FALSE) %>%
dplyr::slice_head(n = 1) %>%
unlist(use.names = FALSE)
}
# compare df header to schema
assertthat::assert_that(
identical(tolower(col_names), tolower(data_col_names)),
msg = glue::glue(
"Field names in `schema` must match column names in data:",
"\u2139 Field names: `{field_names_collapse}`",
"\u2139 Column names in data: `{data_col_names_collapse}`",
.sep = "\n",
field_names_collapse = glue::glue_collapse(col_names, sep = ", "),
data_col_names_collapse = glue::glue_collapse(data_col_names, sep = ", ")
))

return(df)
}
104 changes: 104 additions & 0 deletions tests/testthat/test-read_resource.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,110 @@ test_that("read_resource() returns error on column selection not in schema", {
)
})

test_that("read_resource() returns error on missing columns in data", {
temp_package_dir <- file.path(tempdir(),"missing_cols_package")
# Create datapackage with missing columns in data
dir.create(temp_package_dir)
file.copy(from = list.files(system.file("extdata", package = "frictionless"),
full.names = TRUE),
to = file.path(
temp_package_dir,
list.files(system.file("extdata", package = "frictionless"),
full.names = FALSE))
)
readr::read_csv(file.path(temp_package_dir, "deployments.csv"),
col_select = -start,
show_col_types = FALSE) %>%
readr::write_csv(file.path(temp_package_dir, "deployments.csv"))
# Clean up after test
on.exit(unlink(temp_package_dir, recursive = TRUE))
# Read the new package
missing_cols_package <-
suppressMessages(
read_package(file.path(temp_package_dir,"datapackage.json"))
)
# Test
expect_error(
read_resource(missing_cols_package, "deployments"),
regexp = "must match column names in data"
)
})

test_that("read_resource() returns error on extra columns in data", {
# clean up after test
on.exit(unlink(file.path(tempdir(),"extra_cols_package"), recursive = TRUE))
# create datapackage with extra columns in data
temp_package_dir <- file.path(tempdir(),"extra_cols_package")
dir.create(temp_package_dir)
file.copy(from = list.files(system.file("extdata", package = "frictionless"),
full.names = TRUE),
to = file.path(
temp_package_dir,
list.files(system.file("extdata", package = "frictionless"),
full.names = FALSE))
)
readr::read_csv(file.path(temp_package_dir, "deployments.csv"),
show_col_types = FALSE) %>%
dplyr::mutate(random_column = runif(n = 3)) %>%
readr::write_csv(file.path(temp_package_dir, "deployments.csv"))
# read the new package
extra_cols_package <-
suppressMessages(
read_package(file.path(temp_package_dir,"datapackage.json"))
)
# Test
expect_error(
read_resource(extra_cols_package, "deployments"),
regexp = "must match column names in data"
)
})

test_that("read_resource() returns error on missing columns in schema", {
# create package with a missing column in the schema of observations
missing_col_in_schema_pkg <- example_package
## remove `timestamp`
missing_col_in_schema_pkg$resources[[2]]$schema$fields <-
missing_col_in_schema_pkg$resources[[2]]$schema$fields[-3]
# Test
expect_error(
read_resource(missing_col_in_schema_pkg, "observations"),
regexp = "must match column names in data"
)
})

test_that("read_resource() returns error on column order mismatch between
schema and data", {
# Create package with the wrong order in the schema of deployments
wrong_order_in_schema_pkg <- example_package
## Reorder columns
purrr::pluck(wrong_order_in_schema_pkg, "resources", 1, "schema", "fields") <-
purrr::chuck(wrong_order_in_schema_pkg, "resources", 1, "schema", "fields")[
c(5, 1, 4, 3, 2) # this is not the order the columns have in the data!
]
# Test
expect_error(
read_resource(wrong_order_in_schema_pkg, "deployments"),
regexp = "must match column names in data"
)
})

test_that("read_resource() doesn't compare header when dialect$header is null", {
# not only will it not be compared, the header will be skipped when reading
})

test_that("read_resource() allows case mismatch between schema and data", {
# create package with the wrong case in the schema of observations
wrong_case_in_schema_pkg <- example_package
## Change case of single field name
wrong_case_in_schema_pkg$resources[[2]]$schema$fields[[3]]$name <-
toupper(wrong_case_in_schema_pkg$resources[[2]]$schema$fields[[3]]$name)
# Test
expect_s3_class(
read_resource(wrong_case_in_schema_pkg, "observations"),
"tbl"
)
})

test_that("read_resource() returns error on incorrect Data Package", {
expect_error(
read_resource(list(), "deployments"),
Expand Down