Skip to content

Commit

Permalink
Merge pull request #6 from openpharma/workflow-fixes
Browse files Browse the repository at this point in the history
Support PqConnection and fix graph nodes ordering
  • Loading branch information
krystian8207 authored Feb 9, 2023
2 parents ab4137c + 8754b49 commit 86abb49
Show file tree
Hide file tree
Showing 37 changed files with 1,115 additions and 334 deletions.
1 change: 0 additions & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ Dockerfile
schema_books.yml
schema.yml
^\.github$
README_files
README.Rmd
.RProfile
cran-comments.md
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
fail-fast: false
matrix:
config:
- {os: ubuntu-18.04, r: 'release'}
- {os: ubuntu-20.04, r: 'release'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: DataFakeR
Type: Package
Title: Generate Fake Data for Relational Databases
Version: 0.1.2
Version: 0.1.3
Authors@R: c(
person(
given = "Krystian", family = "Igras",
Expand Down Expand Up @@ -58,7 +58,7 @@ Suggests:
testthat (>= 3.0.0),
rcmdcheck
Config/testthat/edition: 3
RoxygenNote: 7.1.2
RoxygenNote: 7.2.3
VignetteBuilder: knitr
Collate:
'DataFaker-package.R'
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(default_faker_opts)
export(get_faker_opts)
export(levels_rand)
export(na_rand)
export(nrows_simul_constant)
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# DataFakeR 0.1.3

- Fixed computing order for dependent columns simulation.
- Added "PqConnection" driver class methods for pulling DB schema.
- Added `get_faker_opts` method to list current package options.

# DataFakeR 0.1.2

- Built in support for `levels_ratio` parameter
Expand Down
1 change: 1 addition & 0 deletions R/DataFaker-package.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#' Generate fake data based on provided schema specification
#'
#' @name DataFakeR-package
#' @keywords internal
#' @importFrom magrittr %>%

globalVariables(c(
Expand Down
9 changes: 8 additions & 1 deletion R/schema_conf.R
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,8 @@ opt_pull_table <- function(nrows = "exact", ...) {
#' \link{special_simulation}, \link{restricted_simulation},
#' \link{sourcing_metadata}.
#'
#' \code{set_faker_opts} allows to overwrite selected options,
#' \code{set_faker_opts} allows overwriting selected options.
#' \code{get_faker_opts} lists the current options configuration.
#'
#' @param opt_pull_character,opt_pull_numeric,opt_pull_integer,opt_pull_logical,opt_pull_date,opt_pull_table,opt_default_character,opt_simul_spec_character,opt_simul_restricted_character,opt_simul_default_fun_character,opt_default_numeric,opt_simul_spec_numeric,opt_simul_restricted_numeric,opt_simul_default_fun_numeric,opt_default_integer,opt_simul_spec_integer,opt_simul_restricted_integer,opt_simul_default_fun_integer,opt_default_logical,opt_simul_spec_logical,opt_simul_restricted_logical,opt_simul_default_fun_logical,opt_default_date,opt_simul_spec_date,opt_simul_restricted_date,opt_simul_default_fun_date,opt_default_table
#' Parameters defined in default configuration that can be modified by using \code{set_faker_opts} function.
Expand Down Expand Up @@ -575,3 +576,9 @@ set_faker_opts <- function(
}
current_opts
}

#' @rdname faker_configuration
#' @export
get_faker_opts <- function() {
  # The package configuration is kept in the global option "dfkr_options";
  # NULL is returned when that option has not been set.
  current_opts <- getOption("dfkr_options")
  current_opts
}
56 changes: 44 additions & 12 deletions R/schema_from_db.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ get_schema_constraints.RedshiftConnection <- function(source, schema) {
"))
constraint_table <- dplyr::left_join(
constr_table,
constr_table %>% dplyr::select(unique_constraint_name, fk_table_name = table_name, fk_column_name = column_name),
by = "unique_constraint_name",
constr_table %>% dplyr::select(constraint_name, fk_table_name = table_name, fk_column_name = column_name),
by = c("unique_constraint_name" = "constraint_name"),
na_matches = "never"
)
if (nrow(constraint_table) == 0) {
Expand All @@ -21,7 +21,27 @@ get_schema_constraints.RedshiftConnection <- function(source, schema) {
constraint_table
}

get_schema_constraints.PostgresConnection <- get_schema_constraints.RPostgreSQLConnection <- get_schema_constraints.RedshiftConnection
get_schema_constraints.PqConnection <- get_schema_constraints.PostgresConnection <- get_schema_constraints.RPostgreSQLConnection <- get_schema_constraints.RedshiftConnection

# Fetch the primary-key columns of `schema`.`table_name`.
#
# The query reads the pg_index, pg_class, pg_attribute and pg_namespace
# catalogs and keeps only attributes that belong to a primary index
# (`indisprimary`).
#
# source     - DBI connection the query is sent through.
# schema     - schema name, interpolated into the SQL text by glue.
# table_name - table name, interpolated into the SQL text by glue.
#
# Returns the result of DBI::dbGetQuery(); the query selects the columns
# `column_name` and `data_type`.
get_table_pk.RPostgreSQLConnection <- function(source, schema, table_name) {
  # NOTE(review): schema/table_name are pasted into the SQL verbatim;
  # presumably they come from trusted configuration - confirm with callers.
  pk_query <- glue::glue("
SELECT
pg_attribute.attname as column_name,
format_type(pg_attribute.atttypid, pg_attribute.atttypmod) as data_type
FROM pg_index, pg_class, pg_attribute, pg_namespace
WHERE
pg_class.oid = '{schema}.{table_name}'::regclass AND
indrelid = pg_class.oid AND
nspname = '{schema}' AND
pg_class.relnamespace = pg_namespace.oid AND
pg_attribute.attrelid = pg_class.oid AND
pg_attribute.attnum = any(pg_index.indkey)
AND indisprimary
")
  DBI::dbGetQuery(source, pk_query)
}

get_table_pk.PqConnection <- get_table_pk.PostgresConnection <- get_table_pk.RPostgreSQLConnection

get_table_pk.RedshiftConnection <- function(source, schema, table_name) {
# the way to source pk's without having permissions to information_schema
Expand All @@ -36,8 +56,6 @@ get_table_pk.RedshiftConnection <- function(source, schema, table_name) {
DBI::dbGetQuery(source, sql_query)
}

get_table_pk.PostgresConnection <- get_table_pk.RPostgreSQLConnection <- get_table_pk.RedshiftConnection

pull_column_values.RedshiftConnection <- function(source, col_info, values, max_uniq_to_pull) {
if (!identical(values, TRUE) || is.null(max_uniq_to_pull)) {
return(NULL)
Expand All @@ -56,7 +74,7 @@ pull_column_values.RedshiftConnection <- function(source, col_info, values, max_
result
}

pull_column_values.PostgresConnection <- pull_column_values.RPostgreSQLConnection <- pull_column_values.RedshiftConnection
pull_column_values.PqConnection <- pull_column_values.PostgresConnection <- pull_column_values.RPostgreSQLConnection <- pull_column_values.RedshiftConnection

pull_column_nchar.RedshiftConnection <- function(source, col_info, nchar) {
if (!identical(nchar, TRUE)) {
Expand All @@ -70,7 +88,7 @@ pull_column_nchar.RedshiftConnection <- function(source, col_info, nchar) {
nchar
}

pull_column_nchar.PostgresConnection <- pull_column_nchar.RPostgreSQLConnection <- pull_column_nchar.RedshiftConnection
pull_column_nchar.PqConnection <- pull_column_nchar.PostgresConnection <- pull_column_nchar.RPostgreSQLConnection <- pull_column_nchar.RedshiftConnection

pull_column_range.RedshiftConnection <- function(source, col_info, range) {

Expand All @@ -87,7 +105,7 @@ pull_column_range.RedshiftConnection <- function(source, col_info, range) {
c(result$min_val, result$max_val)
}

pull_column_range.PostgresConnection <- pull_column_range.RPostgreSQLConnection <- pull_column_range.RedshiftConnection
pull_column_range.PqConnection <- pull_column_range.PostgresConnection <- pull_column_range.RPostgreSQLConnection <- pull_column_range.RedshiftConnection

pull_column_na_ratio.RedshiftConnection <- function(source, col_info, na_ratio) {

Expand All @@ -104,7 +122,7 @@ pull_column_na_ratio.RedshiftConnection <- function(source, col_info, na_ratio)
result$na_ratio
}

pull_column_na_ratio.PostgresConnection <- pull_column_na_ratio.RPostgreSQLConnection <- pull_column_na_ratio.RedshiftConnection
pull_column_na_ratio.PqConnection <- pull_column_na_ratio.PostgresConnection <- pull_column_na_ratio.RPostgreSQLConnection <- pull_column_na_ratio.RedshiftConnection

pull_data_nrows.RedshiftConnection <- function(source, schema, nrows, ...) {

Expand All @@ -115,6 +133,7 @@ pull_data_nrows.RedshiftConnection <- function(source, schema, nrows, ...) {
tables_rows <- list()
for (table in tables$table_name) {
tbl_rows <- DBI::dbGetQuery(source, glue::glue("SELECT '{table}' as table_name, COUNT(1) as nrows FROM {schema}.{table};"))
tbl_rows$nrows <- as.integer(tbl_rows$nrows)
tables_rows <- append(tables_rows, list(tbl_rows))
}
tables_rows <- dplyr::bind_rows(tables_rows)
Expand All @@ -124,7 +143,7 @@ pull_data_nrows.RedshiftConnection <- function(source, schema, nrows, ...) {
return(tables_rows)
}

pull_data_nrows.PostgresConnection <- pull_data_nrows.RPostgreSQLConnection <- pull_data_nrows.RedshiftConnection
pull_data_nrows.PqConnection <- pull_data_nrows.PostgresConnection <- pull_data_nrows.RPostgreSQLConnection <- pull_data_nrows.RedshiftConnection

pull_column_levels_ratio.RedshiftConnection <- function(source, col_info, levels_ratio) {

Expand All @@ -141,12 +160,25 @@ pull_column_levels_ratio.RedshiftConnection <- function(source, col_info, levels
result$levels_ratio
}

pull_column_levels_ratio.PostgresConnection <- pull_column_levels_ratio.RPostgreSQLConnection <- pull_column_levels_ratio.RedshiftConnection
pull_column_levels_ratio.PqConnection <- pull_column_levels_ratio.PostgresConnection <- pull_column_levels_ratio.RPostgreSQLConnection <- pull_column_levels_ratio.RedshiftConnection

# Pull the column metadata of every table in `schema`.
#
# source - DBI connection the query is sent through.
# schema - schema name, interpolated into the SQL text by glue.
#
# Returns the rows of information_schema.columns whose table_schema
# equals `schema`, exactly as delivered by DBI::dbGetQuery().
get_schema_info.RedshiftConnection <- function(source, schema) {
  columns_query <- glue::glue("select * from information_schema.columns WHERE table_schema = '{schema}'")
  DBI::dbGetQuery(source, columns_query)
}
get_schema_info.PostgresConnection <- get_schema_info.RPostgreSQLConnection <- get_schema_info.RedshiftConnection
# Pull the column metadata of every table in `schema` for the PostgreSQL
# driver classes (PqConnection, PostgresConnection, RPostgreSQLConnection).
#
# source - DBI connection the query is sent through.
# schema - schema name, interpolated into the SQL text by glue.
#
# Returns the rows of information_schema.columns for `schema`, with
# `data_type` rewritten to "char(<n>)" wherever the reported type is
# "character" and character_maximum_length is not NA.
get_schema_info.PqConnection <- get_schema_info.PostgresConnection <- get_schema_info.RPostgreSQLConnection <- function(source, schema) {
  columns_query <- glue::glue("select * from information_schema.columns WHERE table_schema = '{schema}'")
  schema_info <- DBI::dbGetQuery(source, columns_query)
  dplyr::mutate(
    schema_info,
    data_type = ifelse(
      data_type == "character" & !is.na(character_maximum_length),
      paste0("char(", character_maximum_length, ")"),
      data_type
    )
  )
}
1 change: 1 addition & 0 deletions R/simulate_cols.R
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ get_col_params <- function(col_def, schema, faker_opts) {
#' @param col_def Column definition.
#' @param schema Schema object.
#' @param faker_opts Column simulation faker_opts.
#' @keywords internal
fake_column <- function(n, col_def, schema, faker_opts) {
if (is_deterministic(col_def, schema)) {
# Col deterministic, no other rules can be applied
Expand Down
18 changes: 6 additions & 12 deletions R/simulate_tables.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,11 @@
# Return the node indices of `graph` in topological order.
#
# graph - a tidygraph graph object (it is piped through tidygraph/dplyr
#         verbs below).
#
# Every node is first tagged with its original position (`old_ind`), the
# nodes are then sorted with tidygraph::node_topo_order(), and the tagged
# positions are pulled out in the sorted order. The result is therefore a
# vector of original node indices arranged topologically.
nodes_order <- function(graph) {
  graph %>%
    # Remember where each node sat before reordering.
    dplyr::mutate(old_ind = seq_len(tidygraph::graph_order())) %>%
    dplyr::arrange(tidygraph::node_topo_order()) %>%
    tidygraph::activate(nodes) %>%
    as.data.frame() %>%
    dplyr::pull(old_ind)
}

simulate_schema_obj <- function(schema_obj) {
Expand Down
11 changes: 6 additions & 5 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,26 @@ knitr::opts_chunk$set(
fig.width = 8, # Default plot width
fig.height = 6, # .... height
dpi = 200, # Plot resolution
fig.align = "center"
fig.align = "center",
fig.path = "man/figures/README-"
)
knitr::opts_chunk$set() # Figure alignment
library(DataFakeR)
set.seed(123)
options(tibble.width = Inf)
```

# DataFakeR <img src="README_files/figure-gfm/logo.png" align="right" width="120" />
# DataFakeR <img src="man/figures/logo.png" align="right" width="120" />

[![version](https://img.shields.io/static/v1.svg?label=github.com&message=v.0.1.2&color=ff69b4)](https://openpharma.github.io/DataFakeR/)
[![version](https://img.shields.io/static/v1.svg?label=github.com&message=v.0.1.3&color=ff69b4)](https://openpharma.github.io/DataFakeR/)
[![lifecycle](https://img.shields.io/badge/lifecycle-experimental-success.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)

## Overview

DataFakeR is an R package designed to help you generate a sample of fake data that preserves specified assumptions about the original data.

<center>
## <span style="color:blue"> DataFakeR 0.1.2 is now available!</span>
## <span style="color:blue"> DataFakeR 0.1.3 is now available!</span>
</center>

## Installation
Expand Down Expand Up @@ -187,7 +188,7 @@ Special thanks to:

## Lifecycle

DataFakeR 0.1.2 is at experimental stage. If you find bugs or post an issue on GitHub page at <https://github.com/openpharma/DataFakeR/issues>
DataFakeR 0.1.3 is at an experimental stage. If you find bugs, please post an issue on the GitHub page at <https://github.com/openpharma/DataFakeR/issues>

## Getting help

Expand Down
Loading

0 comments on commit 86abb49

Please sign in to comment.