Skip to content

Commit

Permalink
Significantly improved performance of cuminc (#95)
Browse files Browse the repository at this point in the history
* Significantly improved performance of `cuminc` due to removing the for-loop and restructuring the re-encoding of the events.

* Even better performance - though breaks table output length.

* increment version number

* doc updates

* Update PULL_REQUEST_TEMPLATE.md

---------

Co-authored-by: Daniel Sjoberg <[email protected]>
  • Loading branch information
pteridin and ddsjoberg authored Oct 26, 2023
1 parent 99815ec commit fd7f2ed
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 133 deletions.
5 changes: 0 additions & 5 deletions .Rprofile

This file was deleted.

1 change: 1 addition & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Checklist for PR reviewer
- [ ] Code coverage is suitable for any new functions/features. Review coverage with `covr::report()`. Before you run, set `Sys.setenv(NOT_CRAN="true")` and begin in a fresh R session without any packages loaded.
- [ ] R CMD Check runs without errors, warnings, and notes
- [ ] `usethis::use_spell_check()` runs with no spelling errors in documentation
- [ ] Test the update does not break any reverse dependencies with `revdepcheck::revdep_check()` (https://github.com/r-lib/revdepcheck)

When the branch is ready to be merged into master:
- [ ] Update `NEWS.md` with the changes from this pull request under the heading "`# tidycmprsk (development version)`". If there is an issue associated with the pull request, reference it in parentheses at the end of the update.
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tidycmprsk
Title: Competing Risks Estimation
Version: 0.2.0.9001
Version: 0.2.0.9002
Authors@R: c(
person(c("Daniel", "D."), "Sjoberg", , "[email protected]", role = c("aut", "cre", "cph"),
comment = c(ORCID = "0000-0003-0862-2018")),
Expand Down Expand Up @@ -42,4 +42,4 @@ Encoding: UTF-8
Language: en-US
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.1
RoxygenNote: 7.2.3
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# tidycmprsk (development version)

* Performance improvements to `cuminc()`. (@pteridin; #73)

* Updates ahead of the {purrr} v1.0 release.

# tidycmprsk 0.2.0
Expand Down
209 changes: 84 additions & 125 deletions R/broom_methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,8 @@ add_n_stats <- function(df_tidy, x) {
unclass() %>%
tibble::as_tibble()

if ("strata" %in% names(df_tidy)) {
is_strata <- "strata" %in% names(df_tidy)
if (is_strata) {
df_Surv <-
df_Surv %>%
mutate(
Expand All @@ -297,133 +298,91 @@ add_n_stats <- function(df_tidy, x) {
)
}

df_time_zero <-
df_Surv %>%
select(any_of(c("strata", "status"))) %>%
group_by(across(any_of(c("strata")))) %>%
mutate(
time = 0,
n.event = 0L,
n.risk = dplyr::n(),
n.censor = 0L,
outcome =
dplyr::recode(
.data$status,
!!!(as.list(names(x$failcode)) %>% stats::setNames(unlist(x$failcode))),
.default = NA_character_
)
) %>%
filter(.data$status != 0) %>%
select(-dplyr::all_of("status")) %>%
dplyr::distinct() %>%
dplyr::ungroup() %>%
# all of this below is just in case a particular stratum does not have any observed events of any of the outcomes
tidyr::complete(., !!!rlang::syms(intersect(c("strata", "outcome"), names(.)))) %>%
group_by(across(any_of(c("strata")))) %>%
tidyr::fill(dplyr::all_of(c("time", "n.event", "n.risk", "n.censor")), .direction = "updown") %>%
dplyr::ungroup()

df_Surv <-
df_Surv %>%
filter(stats::complete.cases(.)) %>%
arrange(across(any_of(c("strata", "time", "status")))) %>%
group_by(across(any_of(c("strata")))) %>%
mutate(
n.risk = dplyr::n() - dplyr::row_number() + 1L,
n.event = as.integer(.data$status != 0),
n.censor = as.integer(.data$status == 0)
) %>%
group_by(across(any_of(c("strata", "time")))) %>%
mutate(
outcome =
dplyr::recode(
.data$status,
`0` = "censored",
!!!(as.list(names(x$failcode)) %>% stats::setNames(unlist(x$failcode)))
)
) %>%
select(-dplyr::all_of("status")) %>%
dplyr::ungroup() %>%
dplyr::distinct()

df_n_censor <-
df_Surv %>%
filter(.data$outcome == "censored") %>%
group_by(across(any_of(c("strata", "time")))) %>%
dplyr::slice(rep(1:dplyr::n(), each = length(x$failcode))) %>%
mutate(
status = rep(1:length(x$failcode), dplyr::n() / length(x$failcode))
) %>%
mutate(
outcome =
dplyr::recode(.data$status, !!!(as.list(names(x$failcode)) %>% stats::setNames(unlist(x$failcode))))
) %>%
select(-dplyr::all_of("status")) %>%
dplyr::ungroup() %>%
dplyr::distinct()

df_Surv <- df_Surv %>%
filter(.data$outcome != "censored") %>%
group_by(across(any_of(c("strata", "outcome", "time")))) %>%
dplyr::slice(rep(1:dplyr::n(), each = length(x$failcode))) %>%
mutate(
status = rep(1:length(x$failcode), dplyr::n() / length(x$failcode)),
outcome2 =
dplyr::recode(.data$status, !!!(as.list(names(x$failcode)) %>% stats::setNames(unlist(x$failcode)))),
n.event = as.integer(.data$outcome == .data$outcome2),
outcome = .data$outcome2
) %>%
select(-dplyr::all_of(c("status", "outcome2")))

df_Surv <- merge(df_Surv, df_n_censor, all = TRUE)
df_Surv <- merge(df_Surv, df_time_zero, all = TRUE)
df_Surv <- df_Surv %>%
arrange(across(any_of(c("strata", "outcome", "time")))) %>%
mutate(
ties = ifelse(.data$time == dplyr::lag(.data$time, default = -1), 1, 0)
)

df_Surv$keep <- 1
for (ii in 1:nrow(df_Surv)) {
if (df_Surv$ties[ii] == 1) {
df_Surv$n.event[ii] <- df_Surv$n.event[ii] + df_Surv$n.event[ii - 1]
df_Surv$n.censor[ii] <- df_Surv$n.censor[ii] + df_Surv$n.censor[ii - 1]
df_Surv$keep[ii - 1] <- 0
}
}

df_Surv <- df_Surv %>%
filter(.data$keep == 1) %>%
select(-dplyr::all_of(c("ties", "keep")))

df_Surv <- df_Surv %>%
group_by(across(any_of(c("strata", "outcome")))) %>%
arrange(across(any_of(c("strata", "outcome", "time")))) %>%
mutate(
cum.event = as.integer(cumsum(.data$n.event)),
cum.censor = as.integer(cumsum(.data$n.censor))
)

if ("strata" %in% names(df_tidy)) {
output <- merge(df_tidy, df_Surv, by = c("time", "outcome", "strata"), all.y = TRUE)
## Determine n at risk (t = 0) --------
df_n_risk0 <- df_Surv |>
dplyr::group_by(across(any_of(c("strata")))) |>
dplyr::summarize(n.risk = dplyr::n(),
.groups = "drop")

## Determine censored & events each t --------
df_n_cens_event <- df_Surv |>
dplyr::group_by(across(any_of(c("time", "strata", "status")))) |>
dplyr::summarize(n = dplyr::n(),
.groups = "drop")

## Determine censored overall each t ---------
df_n_event_overall <- df_Surv |>
filter(.data$status != 0) |>
dplyr::group_by(across(any_of(c("time", "strata")))) |>
dplyr::summarise(n.event.overall = dplyr::n(),
.groups = "drop")

## Joining the data --------
if(is_strata) {
df_result <- df_tidy |>
dplyr::left_join(df_n_risk0,
by = "strata") |>
dplyr::left_join(df_n_cens_event |>
filter(.data$status != 0) |>
mutate(outcome = factor(.data$status,
x$failcode,
names(x$failcode))) |>
select(-dplyr::all_of(c("status"))) |>
dplyr::rename(n.event = dplyr::all_of("n")),
by = c("strata", "time", "outcome")) |>
dplyr::left_join(df_n_cens_event |>
filter(.data$status == 0) |>
select(-dplyr::all_of("status")) |>
dplyr::rename(n.censor = dplyr::all_of("n")),
by = c("strata", "time")) |>
dplyr::left_join(df_n_event_overall,
by = c("strata", "time"),
relationship = "many-to-many")
} else {
output <- merge(df_tidy, df_Surv, by = c("time", "outcome"), all.y = TRUE)
df_result <- df_tidy |>
dplyr::cross_join(df_n_risk0) |>
dplyr::left_join(df_n_cens_event |>
filter(.data$status != 0) |>
mutate(outcome = factor(.data$status,
x$failcode,
names(x$failcode))) |>
select(-dplyr::all_of(c("status"))) |>
dplyr::rename(n.event = dplyr::all_of("n")),
by = c("time", "outcome")) |>
dplyr::left_join(df_n_cens_event |>
filter(.data$status == 0) |>
select(-dplyr::all_of("status")) |>
dplyr::rename(n.censor = dplyr::all_of("n")),
by = c("time")) |>
dplyr::left_join(df_n_event_overall,
by = c("time"))
}

output %>%
arrange(across(any_of(c("strata", "outcome", "time", "n.risk")))) %>%
group_by(across(any_of(c("strata", "outcome")))) %>%
tidyr::fill(
dplyr::all_of(c(
"n.risk", "estimate", "std.error",
"conf.low", "conf.high",
"n.event", "n.censor", "cum.event",
"cum.censor")),
.direction = "down"
) %>%
dplyr::ungroup() %>%
filter(!is.na(.data$outcome)) %>%
dplyr::distinct()
# Fill missing values and build cum. sum ------
df_result <- df_result |>
mutate(n.event = dplyr::coalesce(.data$n.event, 0L),
n.censor = dplyr::coalesce(.data$n.censor, 0L),
n.event.overall = dplyr::coalesce(.data$n.event.overall, 0L)) |>
group_by(across(any_of(c("strata", "outcome")))) |>
arrange("time") |>
mutate(cum.event = cumsum(.data$n.event),
cum.censor = cumsum(.data$n.censor),
cum.event.overall = cumsum(.data$n.event.overall),
n.risk = .data$n.risk - .data$cum.event.overall - .data$cum.censor +
.data$n.event.overall + .data$n.censor) |>
dplyr::ungroup() |>
arrange(across(any_of(c("strata", "outcome", "time", "n.risk")))) |>
select(any_of(c("time", "outcome", "strata")),
everything()) |>
select(-any_of(c("cum.event.overall",
"n.event.overall")))

if(is_strata)
df_result <- df_result |>
mutate(strata = factor(.data$strata,
levels(df_tidy$strata)))

return(df_result)
}

cuminc_matrix_to_df <- function(x, name, times) {
Expand Down
2 changes: 1 addition & 1 deletion inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ crr
cuminc
doi
exponentiate
ggplot
ggsurvfit
mL
ng
purrr
subdistribution
tbl
tibble
Expand Down

0 comments on commit fd7f2ed

Please sign in to comment.