Merge pull request #31 from r-world-devs/dev

Post release sync.
r-world-devs · Mar 1, 2023 · 1bb4146 · 1bb4146
2 parents 8021a08 + 8b08961
commit 1bb4146
Show file tree

Hide file tree

Showing 11 changed files with 123 additions and 57 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: cohortBuilder
 Type: Package
 Title: Data Source Agnostic Filtering Tools
-Version: 0.1.2
+Version: 0.2.0
 Authors@R: 
     c(person('Krystian', 'Igras', 
              email = '[email protected]', 
@@ -16,14 +16,14 @@ License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 Imports:
-    yaml,
+    R6,
     jsonlite,
     purrr,
+    tibble,
     dplyr (>= 1.0.0),
     tidyr,
     magrittr,
     glue,
-    R6,
     ggplot2,
     rlang (>= 1.0),
     formatR

diff --git a/NEWS.md b/NEWS.md
@@ -1,12 +1,11 @@
-# cohortBuilder 0.1.2
+# cohortBuilder 0.2.0
 
 * Changed the way reproducible code is returned. Now more flexibility is allowed with using e.g. `.repro_code_tweak` method.
 * The `tblist` source reproducible code is now using pipe chains for each dataset filtering.
 * Optimized filtering with having cache computed only for active filters.
-
-# cohortBuilder 0.1.1
-
+* Properly readjust steps and filters ids after step is removed.
 * Add `.post_binding` method, that allows to modify data object when binding is completed.
+* Fix reproducible code generation when no filters applied.
 
 # cohortBuilder 0.1
 

diff --git a/R/cohortBuilder-package.R b/R/cohortBuilder-package.R
@@ -3,15 +3,21 @@
 #' @name cohortBuilder-package
 #' @importFrom magrittr %>%
 #' @importFrom dplyr sym
+NULL
 
 globalVariables(c(
   ":=", "!!", ".data",
   "arrow_end_position_y", "excl_end_position_x", "excl_position_x",
   "excl_position_y", "label", "label_excl",
-  "label_heights", "label_position_x", "label_position_y", "level"
+  "label_heights", "label_position_x", "label_position_y", "level",
+  "dataset", "type", "expr", "new_expr", "expr1", "!<-", "x"
 ))
 
-NULL
+force_import <- function() {
+  R6::R6Class
+  formatR::tidy_source
+  jsonlite::toJSON
+}
 
 `%:::%` <- function(pkg, name) {
   pkg <- as.character(substitute(pkg))

diff --git a/R/cohort_methods.R b/R/cohort_methods.R
@@ -150,10 +150,12 @@ Cohort <- R6::R6Class(
       run_hooks(hook$pre, self, private, step_id)
 
       step_id <- as.character(step_id)
+      clear_data_ids <- steps_range(step_id, rev(names(private$steps))[1])
       private$steps[[step_id]] <- NULL
-      private$cache[[step_id]] <- NULL
-      private$data_objects[steps_range(step_id, length(private$data_objects))] <- NULL
+      private$cache[clear_data_ids] <- NULL
+      private$data_objects[clear_data_ids] <- NULL
       private$steps <- adjust_names(private$steps)
+      private$steps <- purrr::imodify(private$steps, readjust_step)
       if (!is.null(private$steps) && run_flow) {
         self$run_flow(min_step = step_id)
       }

diff --git a/R/list_operators.R b/R/list_operators.R
@@ -35,6 +35,11 @@ rename_item <- function(list_obj, old_name, new_name) {
   return(list_obj)
 }
 
+modify_item <- function(list_obj, new_val, what) {
+  list_obj[[what]] <- new_val
+  return(list_obj)
+}
+
 #' Get function definition
 #'
 #' Whenever the function with provided name exists anywhere, the one is

diff --git a/R/step.R b/R/step.R
@@ -88,6 +88,13 @@ steps_range <- function(from, to) {
   )
 }
 
+readjust_step <- function(step, new_id) {
+  step$id <- new_id
+  step$filters <- purrr::modify(step$filters, modify_item, new_val = new_id, what = "step_id")
+
+  return(step)
+}
+
 prev_step <- function(idx) {
   as.character(as.integer(idx) - 1)
 }

diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 
 # cohortBuilder <img src="man/figures/logo.png" align="right" width="120" />
 
-[![version](https://img.shields.io/static/v1.svg?label=github.com&message=v.0.1.1&color=ff69b4)](https://r-world-devs.github.io/cohortBuilder/)
+[![version](https://img.shields.io/static/v1.svg?label=github.com&message=v.0.2.0&color=ff69b4)](https://r-world-devs.github.io/cohortBuilder/)
 [![lifecycle](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://lifecycle.r-lib.org/articles/stages.html#experimental)
 
 ## Overview
@@ -17,24 +17,24 @@ With only two steps:
 
 You can operate on data using common methods, such as:
 
--   `filter` - to define and `run` to apply filtering rules,
--   `step` - to perform multi-stage filtering,
--   `get_data`, `stat`, `attrition`, `plot_data` - to extract, sum up or
-    visualize your cohort data.
+- `filter` - to define and `run` to apply filtering rules,
+- `step` - to perform multi-stage filtering,
+- `get_data`, `stat`, `attrition`, `plot_data` - to extract, sum up or
+  visualize your cohort data.
 
 With `cohortBuilder` you can share the cohort easier with useful
 methods:
 
--   `code` - to get reproducible cohort creation code,
--   `get_state` - to get cohort state (e.g. in JSON) that can be then
-    easily restored with `restore`.
+- `code` - to get reproducible cohort creation code,
+- `get_state` - to get cohort state (e.g. in JSON) that can be then
+  easily restored with `restore`.
 
 Or modify the cohort configuration with:
 
--   `add_filter`, `rm_filter`, `update_filter` - to manage filters
-    definition
--   `add_step`, `rm_step` - to manage filtering steps,
--   `update_source` - to manage the cohort source.
+- `add_filter`, `rm_filter`, `update_filter` - to manage filters
+  definition
+- `add_step`, `rm_step` - to manage filtering steps,
+- `update_source` - to manage the cohort source.
 
 ## Data sources and extensions
 
@@ -354,24 +354,58 @@ code(coh)
 #>     }
 #>     return(data_object)
 #> }
+#> .run_binding <- function(source, binding_key, data_object_pre, data_object_post,
+#>     ...) {
+#>     binding_dataset <- binding_key$update$dataset
+#>     dependent_datasets <- names(binding_key$data_keys)
+#>     active_datasets <- data_object_post %>%
+#>         purrr::keep(~attr(., "filtered")) %>%
+#>         names()
+#>     if (!any(dependent_datasets %in% active_datasets)) {
+#>         return(data_object_post)
+#>     }
+#>     key_values <- NULL
+#>     common_key_names <- paste0("key_", seq_along(binding_key$data_keys[[1]]$key))
+#>     for (dependent_dataset in dependent_datasets) {
+#>         key_names <- binding_key$data_keys[[dependent_dataset]]$key
+#>         tmp_key_values <- dplyr::distinct(data_object_post[[dependent_dataset]][,
+#>             key_names, drop = FALSE]) %>%
+#>             stats::setNames(common_key_names)
+#>         if (is.null(key_values)) {
+#>             key_values <- tmp_key_values
+#>         } else {
+#>             key_values <- dplyr::inner_join(key_values, tmp_key_values, by = common_key_names)
+#>         }
+#>     }
+#>     data_object_post[[binding_dataset]] <- dplyr::inner_join(switch(as.character(binding_key$post),
+#>         `FALSE` = data_object_pre[[binding_dataset]], `TRUE` = data_object_post[[binding_dataset]]),
+#>         key_values, by = stats::setNames(common_key_names, binding_key$update$key))
+#>     if (binding_key$activate) {
+#>         attr(data_object_post[[binding_dataset]], "filtered") <- TRUE
+#>     }
+#>     return(data_object_post)
+#> }
 #> source <- list(dtconn = as.tblist(librarian))
 #> data_object <- source$dtconn
-#> # step 1
 #> step_id <- "1"
-#> data_object <- .pre_filtering(source, data_object, step_id)
+#> pre_data_object <- data_object
+#> data_object <- .pre_filtering(source, data_object, "1")
 #> data_object[["books"]] <- data_object[["books"]] %>%
 #>     dplyr::filter(author %in% c("Dan Brown", NA))
 #> attr(data_object[["books"]], "filtered") <- TRUE
 #> data_object[["borrowers"]] <- data_object[["borrowers"]] %>%
 #>     dplyr::filter((registered <= Inf & registered >= 14610) | is.na(registered))
 #> attr(data_object[["borrowers"]], "filtered") <- TRUE
-#> # step 2
+#> data_object <- .post_filtering(source, data_object, "1")
+#> for (binding_key in binding_keys) {
+#>     data_object <- .run_binding(source, binding_key, pre_data_object, data_object)
+#> }
 #> step_id <- "2"
-#> data_object <- .pre_filtering(source, data_object, step_id)
+#> data_object <- .pre_filtering(source, data_object, "2")
 #> data_object[["books"]] <- data_object[["books"]] %>%
 #>     dplyr::filter((copies <= 10 & copies >= 5) | is.na(copies))
 #> attr(data_object[["books"]], "filtered") <- TRUE
-#> data_object
+#> data_object <- .post_filtering(source, data_object, "2")
 ```
 
 ``` r
@@ -382,21 +416,21 @@ attrition(coh, dataset = "books")
 
 ``` r
 get_state(coh, json = TRUE)
-#> [{"step":"1","filters":[{"range":[5,6],"type":"discrete","id":"author","name":"author","variable":"author","value":"Dan Brown","dataset":"books","keep_na":true,"description":null,"active":true},{"type":"date_range","id":"registered","name":"registered","variable":"registered","range":["2010-01-01","NA"],"dataset":"borrowers","keep_na":true,"description":null,"active":true}]},{"step":"2","filters":[{"type":"range","id":"copies","name":"copies","variable":"copies","range":[5,10],"dataset":"books","keep_na":true,"description":null,"active":true}]}]
+#> [{"step":"1","filters":[{"range":[5,6],"type":"discrete","id":"author","name":"author","variable":"author","value":"Dan Brown","dataset":"books","keep_na":true,"description":null,"active":true},{"type":"date_range","id":"registered","name":"registered","variable":"registered","range":["2010-01-01","Inf"],"dataset":"borrowers","keep_na":true,"description":null,"active":true}]},{"step":"2","filters":[{"type":"range","id":"copies","name":"copies","variable":"copies","range":[5,10],"dataset":"books","keep_na":true,"description":null,"active":true}]}]
 ```
 
 ## Acknowledgement
 
 Special thanks to:
 
--   [Kamil Wais](mailto:[email protected]) for highlighting the need
-    for the package and its relevance to real-world applications.
--   [Adam Foryś](mailto:[email protected]) for technical support,
-    numerous suggestions for the current and future implementation of
-    the package.
--   [Paweł Kawski](mailto:[email protected]) for indication of
-    initial assumptions about the package based on real-world medical
-    data.
+- [Kamil Wais](mailto:[email protected]) for highlighting the need
+  for the package and its relevance to real-world applications.
+- [Adam Foryś](mailto:[email protected]) for technical support,
+  numerous suggestions for the current and future implementation of the
+  package.
+- [Paweł Kawski](mailto:[email protected]) for indication of
+  initial assumptions about the package based on real-world medical
+  data.
 
 ## Getting help
 

diff --git a/cran-comments.md b/cran-comments.md
@@ -1,16 +1,16 @@
 ## Test environments
 * local check
-  Ubuntu 18.04.6 LTS, R 4.1.2 (2021-11-01)
+  Ubuntu 20.04.5 LTS, R 4.1.2 (2021-11-01)
 * win-builder
   R version 4.1.3 (2022-03-10)
-  R version 4.2.1 (2022-06-23 ucrt)
-  R Under development (unstable) (2022-07-26 r82626 ucrt)
+  R version 4.2.2 (2022-10-31 ucrt)
+  R Under development (unstable) (2023-02-27 r83911 ucrt)
 
-## `R CMD check cohortBuilder_0.1.1.tar.gz --as-cran` results
+## `R CMD check cohortBuilder_0.2.0.tar.gz --as-cran` results
 
 ```
-* using log directory ‘/home/krystian/Projects/Packages/cohortBuilder.Rcheck’
-* using R version 4.1.2 (2021-11-01)
+* using log directory ‘/home/krystian/projects/cohortBuilder.Rcheck’
+* using R version 4.2.1 (2022-06-23)
 * using platform: x86_64-pc-linux-gnu (64-bit)
 ...
 Status: OK
@@ -38,7 +38,7 @@ Status: OK
 
 ```
 * using log directory 'd:/RCompile/CRANguest/R-release/cohortBuilder.Rcheck'
-* using R version 4.2.1 (2022-06-23 ucrt)
+* using R version 4.2.2 (2022-10-31 ucrt)
 * using platform: x86_64-w64-mingw32 (64-bit)
 ...
 * checking CRAN incoming feasibility ... Note_to_CRAN_maintainers
@@ -49,10 +49,10 @@ Status: OK
 
 ```
 * using log directory 'd:/RCompile/CRANguest/R-devel/cohortBuilder.Rcheck'
-* using R Under development (unstable) (2022-07-26 r82626 ucrt)
+* using R Under development (unstable) (2023-02-27 r83911 ucrt)
 * using platform: x86_64-w64-mingw32 (64-bit)
 ...
-* checking CRAN incoming feasibility ... Note_to_CRAN_maintainers
+* checking CRAN incoming feasibility ... [10s] Note_to_CRAN_maintainers
 Maintainer: 'Krystian Igras <[email protected]>'
 ...
 Status: OK

diff --git a/renv/settings.dcf b/renv/settings.dcf
@@ -1,5 +1,5 @@
 external.libraries:
-ignored.packages:
+ignored.packages: cohortBuilder
 package.dependency.fields: Imports, Depends, LinkingTo
 r.version:
 snapshot.type: all

diff --git a/vignettes/cohortBuilder.Rmd b/vignettes/cohortBuilder.Rmd
@@ -374,14 +374,15 @@ specific source (i.e. `dplyr` for `tblist` and `dbplyr` for `db` source):
 code(librarian_cohort)
 ```
 
-We can see above, the resulting code uses `source` object, which creation can be
+We can see above, the resulting code uses `source` object, which creation code can be
 defined separately while creating it:
 
 ```{r}
 librarian_source <- set_source(
   as.tblist(librarian),
   source_code = quote({
-    source <- list(attributes = list(datasets = librarian))
+    source <- list()
+    source$dtconn <- as.tblist(librarian)
   })
 )
 
@@ -409,7 +410,14 @@ librarian_cohort <- librarian_source %>%
 code(librarian_cohort)
 ```
 
-The second option allows to restore cohort configuration using its state.
+What's more, you can manipulate the output with additional arguments:
+
+- `include_methods` - list of methods names which definition should be printed in output,
+- `include_action` - list of actions names (such as "pre_filtering") that should be included in output,
+- `modifier` - a custom modifier of data.frame storing reproducible code parts,
+- `mark_step` - should step ID be presented in output.
+
+The second option for achieving reproducibility allows to restore cohort configuration using its state.
 The cohort state is a list (or json) storing information about all the steps and filters configuration.
 
 You may get the state with `get_state` method:

diff --git a/vignettes/managing-cohort.Rmd b/vignettes/managing-cohort.Rmd
@@ -148,21 +148,23 @@ See `vignette("cohort-configuration")`.
 The last Cohort configuration component - source, can be also managed within the Cohort itself.
 With `update_source` method you can change the source defined in the existing Cohort.
 
-Below we update cohort with Source having `source_code` parameter defined:
+Below we update cohort with Source having `source_code` parameter defined.
+The argument is responsible to generate `source` object definition printed in the reproducible code (you can use it when the default method doesn't print reasonable output).
+
 ```{r}
-code(librarian_cohort)
+code(librarian_cohort, include_methods = NULL)
 
 new_source <- set_source(
   as.tblist(librarian),
   source_code = quote({
-    source <- list(attributes = list(datasets = librarian))
+    source <- list()
+    source$dtconn <- as.tblist(librarian)
   })
 )
 
 update_source(librarian_cohort, new_source)
-code(librarian_cohort)
 sum_up(librarian_cohort)
-code(librarian_cohort)
+code(librarian_cohort, include_methods = NULL)
 ```
 
 Note that updating source doesn't remove Cohort configuration (steps and filters).
@@ -176,9 +178,12 @@ sum_up(librarian_cohort)
 You can also use `update_source` to add Source to an empty Cohort:
 
 ```{r}
+new_source <- set_source(
+  as.tblist(librarian)
+)
 empty_cohort <- cohort()
-update_source(librarian_cohort, new_source)
-sum_up(empty_cohort)
+update_source(empty_cohort, new_source)
+code(empty_cohort, include_methods = NULL)
 ```
 
 The `update_source` method can be also useful if you want to update source along with steps and filters configuration.