-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #181 from r-transit/dev/stop-name-distances
stop distance analysis and stop_name clustering
- Loading branch information
Showing
9 changed files
with
560 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,256 @@ | ||
#' Calculate distances between a given set of stops | ||
#' | ||
#' @param gtfs_stops gtfs stops table either as data frame (with at least `stop_id`, | ||
#' `stop_lon` and `stop_lat` columns) or as `sf` object. | ||
#' | ||
#' @return Returns a data.frame with each row containing a pair of stop_ids (columns | ||
#' `from_stop_id` and `to_stop_id`) and the `distance` between them (in meters) | ||
#' | ||
#' @note The resulting data.frame has nrow(gtfs_stops)^2 rows, distances calculations | ||
#' among all stops for large feeds should be avoided | ||
#' | ||
#' @examples | ||
#' library(dplyr) | ||
#' | ||
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit") | ||
#' nyc <- read_gtfs(nyc_path) | ||
#' | ||
#' nyc$stops %>% | ||
#' filter(stop_name == "Borough Hall") %>% | ||
#' stop_distances() %>% | ||
#' arrange(desc(distance)) | ||
#' | ||
#' #> # A tibble: 36 × 3 | ||
#' #> from_stop_id to_stop_id distance | ||
#' #> <chr> <chr> <dbl> | ||
#' #> 1 423 232 91.5 | ||
#' #> 2 423N 232 91.5 | ||
#' #> 3 423S 232 91.5 | ||
#' #> 4 423 232N 91.5 | ||
#' #> 5 423N 232N 91.5 | ||
#' #> 6 423S 232N 91.5 | ||
#' #> 7 423 232S 91.5 | ||
#' #> 8 423N 232S 91.5 | ||
#' #> 9 423S 232S 91.5 | ||
#' #> 10 232 423 91.5 | ||
#' #> # … with 26 more rows | ||
#' | ||
#' @export | ||
stop_distances = function(gtfs_stops) { | ||
stopifnot(nrow(gtfs_stops) > 1) | ||
from_stop_id <- NULL | ||
if(!inherits(gtfs_stops, "data.frame")) { | ||
stop("Please pass a stops data.frame (i.e. with gtfs_obj$stops)") | ||
} | ||
if(inherits(gtfs_stops, "sf")) { | ||
dist_matrix = sf::st_distance(gtfs_stops) | ||
} else { | ||
dist_matrix = geodist::geodist(gtfs_stops[,c("stop_id", "stop_lon", "stop_lat")]) | ||
} | ||
|
||
rownames(dist_matrix) <- gtfs_stops$stop_id | ||
colnames(dist_matrix) <- gtfs_stops$stop_id | ||
dist_matrix_df = dplyr::as_tibble(dist_matrix, rownames = "from_stop_id") | ||
|
||
# replace gather (no dependency on tidyr) | ||
# dists_gathered = gather(dplyr::as_tibble(dist_matrix_df), "to_stop_id", "dist", -from_stop_id) | ||
dists = reshape(as.data.frame(dist_matrix_df), direction = "long", | ||
idvar = "from_stop_id", timevar = "to_stop_id", v.names = "distance", | ||
varying = dist_matrix_df$from_stop_id) | ||
|
||
dists$to_stop_id <- rep(dist_matrix_df$from_stop_id, each = length(dist_matrix_df$from_stop_id)) | ||
dists$distance <- as.numeric(dists$distance) | ||
|
||
dplyr::as_tibble(dists) | ||
} | ||
|
||
geodist_list = function(lon, lat, names = NULL) { | ||
# second fastest measure after cheap | ||
dists = geodist::geodist(data.frame(lon, lat), measure = "haversine") | ||
if(!is.null(names)) { | ||
colnames(dists) <- rownames(dists) <- names | ||
} | ||
list(dists) | ||
} | ||
|
||
geodist_list_sf = function(pts, names = NULL) { | ||
dists = as.numeric(sf::st_distance(pts)) | ||
if(!is.null(names)) { | ||
colnames(dists) <- rownames(dists) <- names | ||
} | ||
list(dists) | ||
} | ||
|
||
prep_dist_mtrx = function(dist_list) { | ||
mtrx = dist_list[[1]] | ||
if(nrow(mtrx) == 1) return(0) | ||
diag(mtrx) <- NA | ||
v = c(mtrx) | ||
v[!is.na(v)] | ||
} | ||
|
||
#' Calculates distances among stop within the same group column | ||
#' | ||
#' By default calculates distances among stop_ids with the same stop_name. | ||
#' | ||
#' @inheritParams stop_distances | ||
#' @param by group column, default: stop_name | ||
#' | ||
#' @returns data.frame with one row per group containing a distance matrix (distances), | ||
#' number of stop ids within that group (n_stop_ids) and distance summary values | ||
#' (dist_mean, dist_median and dist_max). | ||
#' | ||
#' @examples | ||
#' library(dplyr) | ||
#' | ||
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit") | ||
#' nyc <- read_gtfs(nyc_path) | ||
#' | ||
#' stop_group_distances(nyc$stops) | ||
#' #> # A tibble: 380 × 6 | ||
#' #> stop_name distances n_stop_ids dist_mean dist_median dist_max | ||
#' #> <chr> <list> <dbl> <dbl> <dbl> <dbl> | ||
#' #> 1 86 St <dbl [18 × 18]> 18 5395. 5395. 21811. | ||
#' #> 2 79 St <dbl [6 × 6]> 6 19053. 19053. 19053. | ||
#' #> 3 Prospect Av <dbl [6 × 6]> 6 18804. 18804. 18804. | ||
#' #> 4 77 St <dbl [6 × 6]> 6 16947. 16947. 16947. | ||
#' #> 5 59 St <dbl [6 × 6]> 6 14130. 14130. 14130. | ||
#' #> 6 50 St <dbl [9 × 9]> 9 7097. 7097. 14068. | ||
#' #> 7 36 St <dbl [6 × 6]> 6 12496. 12496. 12496. | ||
#' #> 8 8 Av <dbl [6 × 6]> 6 11682. 11682. 11682. | ||
#' #> 9 7 Av <dbl [9 × 9]> 9 5479. 5479. 10753. | ||
#' #> 10 111 St <dbl [9 × 9]> 9 3877. 3877. 7753. | ||
#' #> # … with 370 more rows | ||
#' | ||
#' @export | ||
stop_group_distances = function(gtfs_stops, by = "stop_name") { | ||
distances <- n_stop_ids <- dist_mean <- dist_median <- dist_max <- NULL | ||
if(inherits(gtfs_stops, "sf")) { | ||
gtfs_stops <- sf_points_to_df(gtfs_stops, c("stop_lon", "stop_lat"), TRUE) | ||
} | ||
if(!by %in% colnames(gtfs_stops)) { | ||
stop("column ", by, " does not exist in ", deparse(substitute(gtfs_stops))) | ||
} | ||
n_stops = table(gtfs_stops$stop_name) | ||
|
||
gtfs_single_stops = gtfs_stops %>% filter(stop_name %in% names(n_stops)[n_stops == 1]) | ||
gtfs_multip_stops = gtfs_stops %>% filter(stop_name %in% names(n_stops)[n_stops != 1]) | ||
|
||
gtfs_multip_stops <- gtfs_multip_stops %>% | ||
dplyr::group_by_at(by) %>% | ||
dplyr::summarise(distances = geodist_list(stop_lon, stop_lat, stop_id), .groups = "keep") %>% | ||
dplyr::mutate(n_stop_ids = nrow(distances[[1]]), | ||
dist_mean = median(prep_dist_mtrx(distances)), | ||
dist_median = median(prep_dist_mtrx(distances)), | ||
dist_max = max(prep_dist_mtrx(distances))) %>% ungroup() | ||
|
||
# tidytable version | ||
# gtfs_multip_stops <- gtfs_multip_stops %>% | ||
# tidytable::summarise.(distances = geodist_list(stop_lon, stop_lat, stop_id), .by = by) %>% | ||
# tidytable::mutate.(n_stop_ids = nrow(distances[[1]]), | ||
# dist_mean = median(prep_dist_mtrx(distances)), | ||
# dist_median = median(prep_dist_mtrx(distances)), | ||
# dist_max = max(prep_dist_mtrx(distances)), .by = "stop_name") | ||
|
||
gtfs_single_stops <- gtfs_single_stops %>% | ||
select(stop_name) %>% | ||
dplyr::mutate(distances = list(matrix(0)), n_stop_ids = 1, dist_mean = 0, dist_median = 0, dist_max = 0) | ||
|
||
dists = dplyr::as_tibble(dplyr::bind_rows(gtfs_single_stops, gtfs_multip_stops)) | ||
dists[order(dists$dist_max, dists$n_stop_ids, dists[[by]], decreasing = T),] | ||
} | ||
|
||
#' Cluster nearby stops within a group | ||
#' | ||
#' Finds clusters of stops for each unique value in `group_col` (e.g. stop_name). Can | ||
#' be used to find different groups of stops that share the same name but are located more | ||
#' than `max_dist` apart. `gtfs_stops` is assigned a new column (named `cluster_colname`) | ||
#' which contains the `group_col` value and the cluster number. | ||
#' | ||
#' [stats::kmeans()] is used for clustering. | ||
#' | ||
#' @param gtfs_stops Stops table of a gtfs object. It is also possible to pass a | ||
#' tidygtfs object to enable piping. | ||
#' @param max_dist Only stop groups that have a maximum distance among them above this | ||
#' threshold (in meters) are clustered. | ||
#' @param group_col Clusters for are calculated for each set of stops with the same value | ||
#' in this column (default: stop_name) | ||
#' @param cluster_colname Name of the new column name. Can be the same as group_col to overwrite. | ||
#' | ||
#' @return Returns a stops table with an added cluster column. If `gtfs_stops` is a tidygtfs object, a | ||
#' modified tidygtfs object is return | ||
#' | ||
#' @importFrom stats kmeans | ||
#' @examples \donttest{ | ||
#' library(dplyr) | ||
#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit") | ||
#' nyc <- read_gtfs(nyc_path) | ||
#' nyc <- cluster_stops(nyc) | ||
#' | ||
#' # There are 6 stops with the name "86 St" that are far apart | ||
#' stops_86_St = nyc$stops %>% | ||
#' filter(stop_name == "86 St") | ||
#' | ||
#' table(stops_86_St$stop_name_cluster) | ||
#' #> 86 St [1] 86 St [2] 86 St [3] 86 St [4] 86 St [5] 86 St [6] | ||
#' #> 3 3 3 3 3 3 | ||
#' | ||
#' stops_86_St %>% select(stop_id, stop_name, parent_station, stop_name_cluster) %>% head() | ||
#' #> # A tibble: 6 × 4 | ||
#' #> stop_id stop_name parent_station stop_name_cluster | ||
#' #> <chr> <chr> <chr> <chr> | ||
#' #> 1 121 86 St "" 86 St [3] | ||
#' #> 2 121N 86 St "121" 86 St [3] | ||
#' #> 3 121S 86 St "121" 86 St [3] | ||
#' #> 4 626 86 St "" 86 St [4] | ||
#' #> 5 626N 86 St "626" 86 St [4] | ||
#' #> 6 626S 86 St "626" 86 St [4] | ||
#' | ||
#' library(ggplot2) | ||
#' ggplot(stops_86_St) + | ||
#' geom_point(aes(stop_lon, stop_lat, color = stop_name_cluster)) | ||
#' } | ||
#' @export | ||
cluster_stops = function(gtfs_stops, | ||
max_dist = 300, | ||
group_col = "stop_name", | ||
cluster_colname = "stop_name_cluster") { | ||
if(inherits(gtfs_stops, "tidygtfs")) { | ||
gstops = gtfs_stops$stops | ||
} else { | ||
gstops = gtfs_stops | ||
} | ||
|
||
is_sf = inherits(gstops, "sf") | ||
stops_clusters = lapply(unique(gstops[[group_col]]), function(sn) { | ||
stop_name_set = gstops[gstops[[group_col]] == sn,] | ||
stop_name_set[[cluster_colname]] <- sn | ||
if(nrow(stop_name_set) == 1) return(stop_name_set) | ||
|
||
dists = stop_distances(stop_name_set) | ||
if(max(dists$distance) > max_dist) { | ||
if(is_sf) { | ||
stop_name_lonlat = do.call(rbind, sf::st_geometry(stop_name_set)) | ||
} else { | ||
stop_name_lonlat = stop_name_set[,c("stop_lon", "stop_lat")] | ||
} | ||
|
||
stops_unique_coords = unique(stop_name_lonlat) | ||
n_dists = min(length(unique(dists$distance)), nrow(stops_unique_coords)) | ||
n_clusters = min(n_dists, nrow(stop_name_set)-1) | ||
|
||
kms = kmeans(stop_name_lonlat, n_clusters) | ||
|
||
stop_name_set[[cluster_colname]] <- paste0(sn, " [", kms$cluster, "]") | ||
} | ||
stop_name_set | ||
}) | ||
stops_clusters = dplyr::bind_rows(stops_clusters) | ||
|
||
if(inherits(gtfs_stops, "tidygtfs")) { | ||
gtfs_stops$stops <- stops_clusters | ||
return(gtfs_stops) | ||
} else { | ||
return(stops_clusters) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.