Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sanitise bin calculations #6212

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ Collate:
'scales-.R'
'stat-align.R'
'stat-bin.R'
'stat-summary-2d.R'
'stat-bin2d.R'
'stat-bindot.R'
'stat-binhex.R'
Expand All @@ -263,7 +264,6 @@ Collate:
'stat-smooth-methods.R'
'stat-smooth.R'
'stat-sum.R'
'stat-summary-2d.R'
'stat-summary-bin.R'
'stat-summary-hex.R'
'stat-summary.R'
Expand Down
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# ggplot2 (development version)

* All binning stats now use the `boundary`/`center` parametrisation rather
than `origin`, following in `stat_bin()`'s footsteps (@teunbrand).
* `stat_summary_2d()` and `stat_bin_2d()` now deal with zero-range data
more elegantly (@teunbrand, #6207).
* New stat: `stat_manual()` for arbitrary computations (@teunbrand, #3501)
* Reversal of a dimension, typically 'x' or 'y', is now controlled by the
`reverse` argument in `coord_cartesian()`, `coord_fixed()`, `coord_radial()`
Expand Down Expand Up @@ -226,7 +230,7 @@
* The ellipsis argument is now checked in `fortify()`, `get_alt_text()`,
`labs()` and several guides (@teunbrand, #3196).
* `stat_summary_bin()` no longer ignores `width` parameter (@teunbrand, #4647).
* Added `keep.zeroes` argument to `stat_bin()` (@teunbrand, #3449)
* Reintroduced `drop` argument to `stat_bin()` (@teunbrand, #3449)
* (internal) removed barriers for using 2D structures as aesthetics
(@teunbrand, #4189).
* `coord_sf()` no longer errors when dealing with empty graticules (@teunbrand, #6052)
Expand Down
109 changes: 94 additions & 15 deletions R/bin.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,12 @@

bin_breaks_width <- function(x_range, width = NULL, center = NULL,
boundary = NULL, closed = c("right", "left")) {
check_length(x_range, 2L)

# binwidth seems to be the argument name supplied to width. (stat-bin and stat-bindot)
check_number_decimal(width, min = 0, allow_infinite = FALSE, arg = "binwidth")

if (!is.null(boundary) && !is.null(center)) {
cli::cli_abort("Only one of {.arg boundary} and {.arg center} may be specified.")
} else if (is.null(boundary)) {
if (is.null(boundary)) {
if (is.null(center)) {
# If neither edge nor center given, compute both using tile layer's
# algorithm. This puts min and max of data in outer half of their bins.
boundary <- width / 2

} else {
# If center given but not boundary, compute boundary.
boundary <- center - width / 2
Expand All @@ -75,9 +68,6 @@

# Find the left side of left-most bin: inputs could be Dates or POSIXct, so
# coerce to numeric first.
x_range <- as.numeric(x_range)
width <- as.numeric(width)
boundary <- as.numeric(boundary)
shift <- floor((x_range[1] - boundary) / width)
origin <- boundary + shift * width

Expand All @@ -104,9 +94,7 @@

bin_breaks_bins <- function(x_range, bins = 30, center = NULL,
boundary = NULL, closed = c("right", "left")) {
check_length(x_range, 2L)

check_number_whole(bins, min = 1)
if (zero_range(x_range)) {
# 0.1 is the same width as the expansion `default_expansion()` gives for 0-width data
width <- 0.1
Expand All @@ -128,6 +116,56 @@

# Compute bins ------------------------------------------------------------

compute_bins <- function(x, scale = NULL, breaks = NULL, binwidth = NULL, bins = NULL,
center = NULL, boundary = NULL,
closed = c("right", "left")) {

range <- if (is.scale(scale)) scale$dimension() else range(x)
check_length(range, 2L)

if (!is.null(breaks)) {
breaks <- allow_lambda(breaks)
if (is.function(breaks)) {
breaks <- breaks(x)
}
if (is.scale(scale) && !scale$is_discrete()) {
breaks <- scale$transform(breaks)
}
check_numeric(breaks)
bins <- bin_breaks(breaks, closed)
return(bins)
}

check_number_decimal(boundary, allow_infinite = FALSE, allow_null = TRUE)
check_number_decimal(center, allow_infinite = FALSE, allow_null = TRUE)
if (!is.null(boundary) && !is.null(center)) {
cli::cli_abort("Only one of {.arg boundary} and {.arg center} may be specified.")
}

if (!is.null(binwidth)) {
binwidth <- allow_lambda(binwidth)
if (is.function(binwidth)) {
binwidth <- binwidth(x)
}
check_number_decimal(binwidth, min = 0, allow_infinite = FALSE)
bins <- bin_breaks_width(
range, binwidth,
center = center, boundary = boundary, closed = closed
)
return(bins)
}

bins <- allow_lambda(bins)
if (is.function(bins)) {
bins <- bins(x)

Check warning on line 160 in R/bin.R

View check run for this annotation

Codecov / codecov/patch

R/bin.R#L160

Added line #L160 was not covered by tests
}
check_number_whole(bins, min = 1, allow_infinite = FALSE)
bin_breaks_bins(
range, bins,
center = center, boundary = boundary, closed = closed
)
}

bin_vector <- function(x, bins, weight = NULL, pad = FALSE) {
check_object(bins, is_bins, "a {.cls ggplot2_bins} object")

Expand All @@ -141,8 +179,7 @@
weight[is.na(weight)] <- 0
}

bin_idx <- cut(x, bins$fuzzy, right = bins$right_closed,
include.lowest = TRUE)
bin_idx <- bin_cut(x, bins)
bin_count <- as.numeric(tapply(weight, bin_idx, sum, na.rm = TRUE))
bin_count[is.na(bin_count)] <- 0

Expand Down Expand Up @@ -170,6 +207,10 @@
bin_out(bin_count, bin_x, bin_widths)
}

bin_cut <- function(x, bins) {
cut(x, bins$fuzzy, right = bins$right_closed, include.lowest = TRUE)
}

bin_out <- function(count = integer(0), x = numeric(0), width = numeric(0),
xmin = x - width / 2, xmax = x + width / 2) {
density <- count / width / sum(abs(count))
Expand All @@ -186,3 +227,41 @@
.size = length(count)
)
}

bin_loc <- function(x, id) {
left <- x[-length(x)]
right <- x[-1]

list(
left = left[id],
right = right[id],
mid = ((left + right) / 2)[id],
length = diff(x)[id]
)
}

fix_bin_params = function(params, fun, version) {

if (!is.null(params$origin)) {
args <- paste0(fun, c("(origin)", "(boundary)"))
deprecate_warn0(version, args[1], args[2])
params$boudnary <- params$origin
params$origin <- NULL

Check warning on line 249 in R/bin.R

View check run for this annotation

Codecov / codecov/patch

R/bin.R#L246-L249

Added lines #L246 - L249 were not covered by tests
}

if (!is.null(params$right)) {
args <- paste0(fun, c("(right)", "(closed)"))
deprecate_warn0(version, args[1], args[2])
params$closed <- if (isTRUE(params$right)) "right" else "left"
params$right <- NULL

Check warning on line 256 in R/bin.R

View check run for this annotation

Codecov / codecov/patch

R/bin.R#L253-L256

Added lines #L253 - L256 were not covered by tests
}

if (is.null(params$breaks %||% params$binwidth %||% params$bins)) {
cli::cli_inform(
"{.fn {fun}} using {.code bins = 30}. Pick better value {.arg binwidth}."
)
params$bins <- 30

Check warning on line 263 in R/bin.R

View check run for this annotation

Codecov / codecov/patch

R/bin.R#L260-L263

Added lines #L260 - L263 were not covered by tests
}

params
}
72 changes: 21 additions & 51 deletions R/stat-bin.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@
#' or left edges of bins are included in the bin.
#' @param pad If `TRUE`, adds empty bins at either end of x. This ensures
#' frequency polygons touch 0. Defaults to `FALSE`.
#' @param keep.zeroes Treatment of zero count bins. If `"all"` (default), such
#' @param drop Treatment of zero count bins. If `"all"` (default), such
#' bins are kept as-is. If `"none"`, all zero count bins are filtered out.
#' If `"inner"` only zero count bins at the flanks are filtered out, but not
#' in the middle.
#' in the middle. `TRUE` is shorthand for `"all"` and `FALSE` is shorthand
#' for `"none"`.
#' @eval rd_computed_vars(
#' count = "number of points in bin.",
#' density = "density of points in bin, scaled to integrate to 1.",
Expand Down Expand Up @@ -59,7 +60,7 @@ stat_bin <- function(mapping = NULL, data = NULL,
closed = c("right", "left"),
pad = FALSE,
na.rm = FALSE,
keep.zeroes = "all",
drop = "all",
orientation = NA,
show.legend = NA,
inherit.aes = TRUE) {
Expand All @@ -82,7 +83,7 @@ stat_bin <- function(mapping = NULL, data = NULL,
pad = pad,
na.rm = na.rm,
orientation = orientation,
keep.zeroes = keep.zeroes,
drop = drop,
...
)
)
Expand All @@ -95,9 +96,13 @@ stat_bin <- function(mapping = NULL, data = NULL,
StatBin <- ggproto("StatBin", Stat,
setup_params = function(self, data, params) {
params$flipped_aes <- has_flipped_aes(data, params, main_is_orthogonal = FALSE)
params$keep.zeroes <- arg_match0(
params$keep.zeroes %||% "all",
c("all", "none", "inner"), arg_nm = "keep.zeroes"

if (is.logical(params$drop)) {
params$drop <- if (isTRUE(params$drop)) "all" else "none"
}
params$drop <- arg_match0(
params$drop %||% "all",
c("all", "none", "inner"), arg_nm = "drop"
)

has_x <- !(is.null(data$x) && is.null(params$x))
Expand All @@ -118,29 +123,7 @@ StatBin <- ggproto("StatBin", Stat,
))
}

if (!is.null(params$drop)) {
deprecate_warn0("2.1.0", "stat_bin(drop)", "stat_bin(pad)")
params$drop <- NULL
}
if (!is.null(params$origin)) {
deprecate_warn0("2.1.0", "stat_bin(origin)", "stat_bin(boundary)")
params$boundary <- params$origin
params$origin <- NULL
}
if (!is.null(params$right)) {
deprecate_warn0("2.1.0", "stat_bin(right)", "stat_bin(closed)")
params$closed <- if (params$right) "right" else "left"
params$right <- NULL
}
if (!is.null(params$boundary) && !is.null(params$center)) {
cli::cli_abort("Only one of {.arg boundary} and {.arg center} may be specified in {.fn {snake_class(self)}}.")
}

if (is.null(params$breaks) && is.null(params$binwidth) && is.null(params$bins)) {
cli::cli_inform("{.fn {snake_class(self)}} using {.code bins = 30}. Pick better value with {.arg binwidth}.")
params$bins <- 30
}

params <- fix_bin_params(params, fun = snake_class(self), version = "2.1.0")
params
},

Expand All @@ -149,33 +132,20 @@ StatBin <- ggproto("StatBin", Stat,
compute_group = function(data, scales, binwidth = NULL, bins = NULL,
center = NULL, boundary = NULL,
closed = c("right", "left"), pad = FALSE,
breaks = NULL, flipped_aes = FALSE, keep.zeroes = "all",
breaks = NULL, flipped_aes = FALSE, drop = "all",
# The following arguments are not used, but must
# be listed so parameters are computed correctly
origin = NULL, right = NULL, drop = NULL) {
origin = NULL, right = NULL) {
x <- flipped_names(flipped_aes)$x
if (!is.null(breaks)) {
if (is.function(breaks)) {
breaks <- breaks(data[[x]])
}
if (!scales[[x]]$is_discrete()) {
breaks <- scales[[x]]$transform(breaks)
}
bins <- bin_breaks(breaks, closed)
} else if (!is.null(binwidth)) {
if (is.function(binwidth)) {
binwidth <- binwidth(data[[x]])
}
bins <- bin_breaks_width(scales[[x]]$dimension(), binwidth,
center = center, boundary = boundary, closed = closed)
} else {
bins <- bin_breaks_bins(scales[[x]]$dimension(), bins, center = center,
boundary = boundary, closed = closed)
}
bins <- compute_bins(
data[[x]], scales[[x]],
breaks = breaks, binwidth = binwidth, bins = bins,
center = center, boundary = boundary, closed = closed
)
bins <- bin_vector(data[[x]], bins, weight = data$weight, pad = pad)

keep <- switch(
keep.zeroes,
drop,
none = bins$count != 0,
inner = inner_runs(bins$count != 0),
TRUE
Expand Down
Loading
Loading