Merge pull request #6 from openpharma/workflow-fixes

Support PqConnection and fix graph nodes ordering
openpharma · Feb 9, 2023 · 86abb49 · 86abb49
2 parents ab4137c + 8754b49
commit 86abb49
Show file tree

Hide file tree

Showing 37 changed files with 1,115 additions and 334 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -11,7 +11,6 @@ Dockerfile
 schema_books.yml
 schema.yml
 ^\.github$
-README_files
 README.Rmd
 .RProfile
 cran-comments.md

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -18,7 +18,7 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - {os: ubuntu-18.04,   r: 'release'}
+          - {os: ubuntu-20.04,   r: 'release'}
 
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: DataFakeR
 Type: Package
 Title: Generate Fake Data for Relational Databases
-Version: 0.1.2
+Version: 0.1.3
 Authors@R: c(
             person(
               given = "Krystian", family = "Igras",
@@ -58,7 +58,7 @@ Suggests:
     testthat (>= 3.0.0),
     rcmdcheck
 Config/testthat/edition: 3
-RoxygenNote: 7.1.2
+RoxygenNote: 7.2.3
 VignetteBuilder: knitr
 Collate:
     'DataFaker-package.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(default_faker_opts)
+export(get_faker_opts)
 export(levels_rand)
 export(na_rand)
 export(nrows_simul_constant)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# DataFakeR 0.1.3
+
+- Fixed computing order for dependent columns simulation.
+- Added "PqConnection" driver class methods for pulling DB schema.
+- Added `get_faker_opts` method to list current package options.
+
 # DataFakeR 0.1.2
 
 - Built in support for `levels_ratio` parameter

diff --git a/R/DataFaker-package.R b/R/DataFaker-package.R
@@ -1,6 +1,7 @@
 #' Generate fake data based on provided schema specification
 #'
 #' @name DataFakeR-package
+#' @keywords internal
 #' @importFrom magrittr %>%
 
 globalVariables(c(

diff --git a/R/schema_conf.R b/R/schema_conf.R
@@ -502,7 +502,8 @@ opt_pull_table <- function(nrows = "exact", ...) {
 #' \link{special_simulation}, \link{restricted_simulation},
 #' \link{sourcing_metadata}.
 #'
-#' \code{set_faker_opts} allows to overwrite selected options,
+#' \code{set_faker_opts} allows overwriting selected options.
+#' \code{get_faker_opts} lists the current options configuration.
 #'
 #' @param opt_pull_character,opt_pull_numeric,opt_pull_integer,opt_pull_logical,opt_pull_date,opt_pull_table,opt_default_character,opt_simul_spec_character,opt_simul_restricted_character,opt_simul_default_fun_character,opt_default_numeric,opt_simul_spec_numeric,opt_simul_restricted_numeric,opt_simul_default_fun_numeric,opt_default_integer,opt_simul_spec_integer,opt_simul_restricted_integer,opt_simul_default_fun_integer,opt_default_logical,opt_simul_spec_logical,opt_simul_restricted_logical,opt_simul_default_fun_logical,opt_default_date,opt_simul_spec_date,opt_simul_restricted_date,opt_simul_default_fun_date,opt_default_table
 #' Parameters defined in default configuration that can be modified by using \code{set_faker_opts} function.
@@ -575,3 +576,9 @@ set_faker_opts <- function(
   }
   current_opts
 }
+
+#' @rdname faker_configuration
+#' @export
+get_faker_opts <- function() {
+  getOption("dfkr_options")
+}
diff --git a/R/schema_from_db.R b/R/schema_from_db.R
@@ -11,8 +11,8 @@ get_schema_constraints.RedshiftConnection <- function(source, schema) {
   "))
   constraint_table <- dplyr::left_join(
     constr_table,
-    constr_table %>% dplyr::select(unique_constraint_name, fk_table_name = table_name, fk_column_name = column_name),
-    by = "unique_constraint_name",
+    constr_table %>% dplyr::select(constraint_name, fk_table_name = table_name, fk_column_name = column_name),
+    by = c("unique_constraint_name" = "constraint_name"),
     na_matches = "never"
   )
   if (nrow(constraint_table) == 0) {
@@ -21,7 +21,27 @@ get_schema_constraints.RedshiftConnection <- function(source, schema) {
   constraint_table
 }
 
-get_schema_constraints.PostgresConnection <- get_schema_constraints.RPostgreSQLConnection <- get_schema_constraints.RedshiftConnection
+get_schema_constraints.PqConnection <- get_schema_constraints.PostgresConnection <- get_schema_constraints.RPostgreSQLConnection <- get_schema_constraints.RedshiftConnection
+
+get_table_pk.RPostgreSQLConnection <- function(source, schema, table_name) {
+  sql_query <- glue::glue("
+    SELECT
+      pg_attribute.attname as column_name,
+      format_type(pg_attribute.atttypid, pg_attribute.atttypmod) as data_type
+    FROM pg_index, pg_class, pg_attribute, pg_namespace
+    WHERE
+      pg_class.oid = '{schema}.{table_name}'::regclass AND
+      indrelid = pg_class.oid AND
+      nspname = '{schema}' AND
+      pg_class.relnamespace = pg_namespace.oid AND
+      pg_attribute.attrelid = pg_class.oid AND
+      pg_attribute.attnum = any(pg_index.indkey)
+     AND indisprimary
+  ")
+  DBI::dbGetQuery(source, sql_query)
+}
+
+get_table_pk.PqConnection <- get_table_pk.PostgresConnection <- get_table_pk.RPostgreSQLConnection
 
 get_table_pk.RedshiftConnection <- function(source, schema, table_name) {
   # the way to source pk's without having permissions to information_schema
@@ -36,8 +56,6 @@ get_table_pk.RedshiftConnection <- function(source, schema, table_name) {
   DBI::dbGetQuery(source, sql_query)
 }
 
-get_table_pk.PostgresConnection <- get_table_pk.RPostgreSQLConnection <- get_table_pk.RedshiftConnection
-
 pull_column_values.RedshiftConnection <- function(source, col_info, values, max_uniq_to_pull) {
   if (!identical(values, TRUE) || is.null(max_uniq_to_pull)) {
     return(NULL)
@@ -56,7 +74,7 @@ pull_column_values.RedshiftConnection <- function(source, col_info, values, max_
   result
 }
 
-pull_column_values.PostgresConnection <- pull_column_values.RPostgreSQLConnection <- pull_column_values.RedshiftConnection
+pull_column_values.PqConnection <- pull_column_values.PostgresConnection <- pull_column_values.RPostgreSQLConnection <- pull_column_values.RedshiftConnection
 
 pull_column_nchar.RedshiftConnection <- function(source, col_info, nchar) {
   if (!identical(nchar, TRUE)) {
@@ -70,7 +88,7 @@ pull_column_nchar.RedshiftConnection <- function(source, col_info, nchar) {
   nchar
 }
 
-pull_column_nchar.PostgresConnection <- pull_column_nchar.RPostgreSQLConnection <- pull_column_nchar.RedshiftConnection
+pull_column_nchar.PqConnection <- pull_column_nchar.PostgresConnection <- pull_column_nchar.RPostgreSQLConnection <- pull_column_nchar.RedshiftConnection
 
 pull_column_range.RedshiftConnection <- function(source, col_info, range) {
 
@@ -87,7 +105,7 @@ pull_column_range.RedshiftConnection <- function(source, col_info, range) {
   c(result$min_val, result$max_val)
 }
 
-pull_column_range.PostgresConnection <- pull_column_range.RPostgreSQLConnection <- pull_column_range.RedshiftConnection
+pull_column_range.PqConnection <- pull_column_range.PostgresConnection <- pull_column_range.RPostgreSQLConnection <- pull_column_range.RedshiftConnection
 
 pull_column_na_ratio.RedshiftConnection <- function(source, col_info, na_ratio) {
 
@@ -104,7 +122,7 @@ pull_column_na_ratio.RedshiftConnection <- function(source, col_info, na_ratio)
   result$na_ratio
 }
 
-pull_column_na_ratio.PostgresConnection <- pull_column_na_ratio.RPostgreSQLConnection <- pull_column_na_ratio.RedshiftConnection
+pull_column_na_ratio.PqConnection <- pull_column_na_ratio.PostgresConnection <- pull_column_na_ratio.RPostgreSQLConnection <- pull_column_na_ratio.RedshiftConnection
 
 pull_data_nrows.RedshiftConnection <- function(source, schema, nrows, ...) {
 
@@ -115,6 +133,7 @@ pull_data_nrows.RedshiftConnection <- function(source, schema, nrows, ...) {
   tables_rows <- list()
   for (table in tables$table_name) {
     tbl_rows <- DBI::dbGetQuery(source, glue::glue("SELECT '{table}' as table_name, COUNT(1) as nrows FROM {schema}.{table};"))
+    tbl_rows$nrows <- as.integer(tbl_rows$nrows)
     tables_rows <- append(tables_rows, list(tbl_rows))
   }
   tables_rows <- dplyr::bind_rows(tables_rows)
@@ -124,7 +143,7 @@ pull_data_nrows.RedshiftConnection <- function(source, schema, nrows, ...) {
   return(tables_rows)
 }
 
-pull_data_nrows.PostgresConnection <- pull_data_nrows.RPostgreSQLConnection <- pull_data_nrows.RedshiftConnection
+pull_data_nrows.PqConnection <- pull_data_nrows.PostgresConnection <- pull_data_nrows.RPostgreSQLConnection <- pull_data_nrows.RedshiftConnection
 
 pull_column_levels_ratio.RedshiftConnection <- function(source, col_info, levels_ratio) {
 
@@ -141,12 +160,25 @@ pull_column_levels_ratio.RedshiftConnection <- function(source, col_info, levels
   result$levels_ratio
 }
 
-pull_column_levels_ratio.PostgresConnection <- pull_column_levels_ratio.RPostgreSQLConnection <- pull_column_levels_ratio.RedshiftConnection
+pull_column_levels_ratio.PqConnection <- pull_column_levels_ratio.PostgresConnection <- pull_column_levels_ratio.RPostgreSQLConnection <- pull_column_levels_ratio.RedshiftConnection
 
 get_schema_info.RedshiftConnection <- function(source, schema) {
   DBI::dbGetQuery(
     source,
     glue::glue("select * from information_schema.columns WHERE table_schema = '{schema}'")
   )
 }
-get_schema_info.PostgresConnection <- get_schema_info.RPostgreSQLConnection <- get_schema_info.RedshiftConnection
+get_schema_info.PqConnection <- get_schema_info.PostgresConnection <- get_schema_info.RPostgreSQLConnection <- function(source, schema) {
+  info <- DBI::dbGetQuery(
+    source,
+    glue::glue("select * from information_schema.columns WHERE table_schema = '{schema}'")
+  )
+  info %>%
+    dplyr::mutate(
+      data_type = ifelse(
+        data_type == "character" & !is.na(character_maximum_length),
+        paste0("char(", character_maximum_length, ")"),
+        data_type
+      )
+    )
+}
diff --git a/R/simulate_cols.R b/R/simulate_cols.R
@@ -243,6 +243,7 @@ get_col_params <- function(col_def, schema, faker_opts) {
 #' @param col_def Column definition.
 #' @param schema Schema object.
 #' @param faker_opts Column simulation faker_opts.
+#' @keywords internal
 fake_column <- function(n, col_def, schema, faker_opts) {
   if (is_deterministic(col_def, schema)) {
     # Col deterministic, no other rules can be applied

diff --git a/R/simulate_tables.R b/R/simulate_tables.R
@@ -1,17 +1,11 @@
 nodes_order <- function(graph) {
-  nodes <- graph %>%
+  nodes_ordered <- graph %>%
+    dplyr::mutate(old_ind = seq_len(tidygraph::graph_order())) %>%
+    dplyr::arrange(tidygraph::node_topo_order()) %>%
     tidygraph::activate(nodes) %>%
-    as.data.frame() %>% {
-      1:nrow(.)
-    }
-  edges <- graph %>%
-    tidygraph::activate(edges) %>%
-    as.data.frame()
-
-  edges <- edges %>%
-    dplyr::mutate(source = !from %in% to)
-
-  unique(c(edges[edges$source, "from"], edges[!edges$source, "from"], nodes))
+    as.data.frame() %>%
+    dplyr::pull(old_ind)
+  return(nodes_ordered)
 }
 
 simulate_schema_obj <- function(schema_obj) {

diff --git a/README.Rmd b/README.Rmd
@@ -13,25 +13,26 @@ knitr::opts_chunk$set(
   fig.width = 8,       # Default plot width
   fig.height = 6,      # .... height
   dpi = 200,           # Plot resolution
-  fig.align = "center"
+  fig.align = "center",
+  fig.path = "man/figures/README-"
 )
 knitr::opts_chunk$set()  # Figure alignment   
 library(DataFakeR)
 set.seed(123)
 options(tibble.width = Inf)
 ```
 
-# DataFakeR <img src="README_files/figure-gfm/logo.png" align="right" width="120" />
+# DataFakeR <img src="man/figures/logo.png" align="right" width="120" />
 
-[![version](https://img.shields.io/static/v1.svg?label=github.com&message=v.0.1.2&color=ff69b4)](https://openpharma.github.io/DataFakeR/)
+[![version](https://img.shields.io/static/v1.svg?label=github.com&message=v.0.1.3&color=ff69b4)](https://openpharma.github.io/DataFakeR/)
 [![lifecycle](https://img.shields.io/badge/lifecycle-experimental-success.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
 
 ## Overview
 
 DataFakeR is an R package designed to help you generate a sample of fake data that preserves specified assumptions about the original data. 
 
 <center>
-## <span style="color:blue"> DataFakeR 0.1.2 is now available!</span>
+## <span style="color:blue"> DataFakeR 0.1.3 is now available!</span>
 </center>
 
 ## Installation
@@ -187,7 +188,7 @@ Special thanks to:
 
 ## Lifecycle
 
-DataFakeR 0.1.2 is at experimental stage. If you find bugs or post an issue on GitHub page at <https://github.com/openpharma/DataFakeR/issues>
+DataFakeR 0.1.3 is at an experimental stage. If you find bugs or have suggestions, please post an issue on the GitHub page at <https://github.com/openpharma/DataFakeR/issues>
 
 ## Getting help