Merge pull request #181 from r-transit/dev/stop-name-distances

stop distance analysis and stop_name clustering
r-transit · Jan 27, 2022 · 2593862 · 2593862
2 parents bd587fc + 0b700f1
commit 2593862
Show file tree

Hide file tree

Showing 9 changed files with 560 additions and 11 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 S3method(plot,tidygtfs)
 S3method(print,tidygtfs)
 S3method(summary,tidygtfs)
+export(cluster_stops)
 export(filter_feed_by_area)
 export(filter_feed_by_date)
 export(filter_feed_by_stops)
@@ -22,6 +23,8 @@ export(set_api_key)
 export(set_servicepattern)
 export(sf_as_tbl)
 export(shapes_as_sf)
+export(stop_distances)
+export(stop_group_distances)
 export(stops_as_sf)
 export(travel_times)
 export(validate_gtfs)
@@ -55,6 +58,7 @@ importFrom(sf,st_cast)
 importFrom(sf,st_transform)
 importFrom(stats,"na.omit")
 importFrom(stats,"setNames")
+importFrom(stats,kmeans)
 importFrom(stats,median)
 importFrom(stats,reshape)
 importFrom(stats,sd)

diff --git a/R/geo.R b/R/geo.R
@@ -0,0 +1,256 @@
+#' Calculate distances between a given set of stops
+#' 
+#' @param gtfs_stops gtfs stops table either as data frame (with at least `stop_id`, 
+#'                   `stop_lon` and `stop_lat` columns) or as `sf` object.
+#' 
+#' @return Returns a data.frame with each row containing a pair of stop_ids (columns
+#'         `from_stop_id` and `to_stop_id`) and the `distance` between them (in meters)
+#'        
+#' @note The resulting data.frame has nrow(gtfs_stops)^2 rows, distances calculations 
+#'       among all stops for large feeds should be avoided
+#'       
+#' @examples
+#' library(dplyr)
+#' 
+#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
+#' nyc <- read_gtfs(nyc_path)
+#' 
+#' nyc$stops %>%
+#'   filter(stop_name == "Borough Hall") %>%
+#'   stop_distances() %>%
+#'   arrange(desc(distance))
+#'
+#' #> # A tibble: 36 × 3
+#' #>    from_stop_id to_stop_id  distance
+#' #>    <chr>        <chr>          <dbl>
+#' #>  1 423          232             91.5
+#' #>  2 423N         232             91.5
+#' #>  3 423S         232             91.5
+#' #>  4 423          232N            91.5
+#' #>  5 423N         232N            91.5
+#' #>  6 423S         232N            91.5
+#' #>  7 423          232S            91.5
+#' #>  8 423N         232S            91.5
+#' #>  9 423S         232S            91.5
+#' #> 10 232          423             91.5
+#' #> # … with 26 more rows
+#' 
+#' @export
+stop_distances = function(gtfs_stops) {
+  stopifnot(nrow(gtfs_stops) > 1)
+  from_stop_id <- NULL
+  if(!inherits(gtfs_stops, "data.frame")) {
+    stop("Please pass a stops data.frame (i.e. with gtfs_obj$stops)")
+  }
+  if(inherits(gtfs_stops, "sf")) {
+    dist_matrix = sf::st_distance(gtfs_stops)
+  } else {
+    dist_matrix = geodist::geodist(gtfs_stops[,c("stop_id", "stop_lon", "stop_lat")])
+  }
+
+  rownames(dist_matrix) <- gtfs_stops$stop_id
+  colnames(dist_matrix) <- gtfs_stops$stop_id
+  dist_matrix_df = dplyr::as_tibble(dist_matrix, rownames = "from_stop_id")
+
+  # replace gather (no dependency on tidyr)
+  # dists_gathered = gather(dplyr::as_tibble(dist_matrix_df), "to_stop_id", "dist", -from_stop_id)
+  dists = reshape(as.data.frame(dist_matrix_df), direction = "long",
+          idvar = "from_stop_id", timevar = "to_stop_id", v.names = "distance",
+          varying = dist_matrix_df$from_stop_id)
+
+  dists$to_stop_id <- rep(dist_matrix_df$from_stop_id, each = length(dist_matrix_df$from_stop_id))
+  dists$distance <- as.numeric(dists$distance)
+
+  dplyr::as_tibble(dists)
+}
+
+geodist_list = function(lon, lat, names = NULL) {
+  # second fastest measure after cheap
+  dists = geodist::geodist(data.frame(lon, lat), measure = "haversine")
+  if(!is.null(names)) {
+    colnames(dists) <- rownames(dists) <- names
+  }
+  list(dists)
+}
+
+geodist_list_sf = function(pts, names = NULL) {
+  dists = as.numeric(sf::st_distance(pts))
+  if(!is.null(names)) {
+    colnames(dists) <- rownames(dists) <- names
+  }
+  list(dists)
+}
+
+prep_dist_mtrx = function(dist_list) {
+  mtrx = dist_list[[1]]
+  if(nrow(mtrx) == 1) return(0)
+  diag(mtrx) <- NA
+  v = c(mtrx)
+  v[!is.na(v)]
+}
+
+#' Calculates distances among stop within the same group column
+#' 
+#' By default calculates distances among stop_ids with the same stop_name.
+#' 
+#' @inheritParams stop_distances
+#' @param by group column, default: stop_name
+#' 
+#' @returns data.frame with one row per group containing a distance matrix (distances),
+#'          number of stop ids within that group (n_stop_ids) and distance summary values 
+#'          (dist_mean, dist_median and dist_max).
+#'          
+#' @examples
+#' library(dplyr)
+#' 
+#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
+#' nyc <- read_gtfs(nyc_path)
+#' 
+#' stop_group_distances(nyc$stops)
+#' #> # A tibble: 380 × 6
+#' #>    stop_name   distances       n_stop_ids dist_mean dist_median dist_max
+#' #>    <chr>       <list>               <dbl>     <dbl>       <dbl>    <dbl>
+#' #>  1 86 St       <dbl [18 × 18]>         18     5395.       5395.   21811.
+#' #>  2 79 St       <dbl [6 × 6]>            6    19053.      19053.   19053.
+#' #>  3 Prospect Av <dbl [6 × 6]>            6    18804.      18804.   18804.
+#' #>  4 77 St       <dbl [6 × 6]>            6    16947.      16947.   16947.
+#' #>  5 59 St       <dbl [6 × 6]>            6    14130.      14130.   14130.
+#' #>  6 50 St       <dbl [9 × 9]>            9     7097.       7097.   14068.
+#' #>  7 36 St       <dbl [6 × 6]>            6    12496.      12496.   12496.
+#' #>  8 8 Av        <dbl [6 × 6]>            6    11682.      11682.   11682.
+#' #>  9 7 Av        <dbl [9 × 9]>            9     5479.       5479.   10753.
+#' #> 10 111 St      <dbl [9 × 9]>            9     3877.       3877.    7753.
+#' #> # … with 370 more rows
+#' 
+#' @export
+stop_group_distances = function(gtfs_stops, by = "stop_name") {
+  distances <- n_stop_ids <- dist_mean <- dist_median <- dist_max <- NULL
+  if(inherits(gtfs_stops, "sf")) {
+    gtfs_stops <- sf_points_to_df(gtfs_stops, c("stop_lon", "stop_lat"), TRUE)
+  }
+  if(!by %in% colnames(gtfs_stops)) {
+    stop("column ", by, " does not exist in ", deparse(substitute(gtfs_stops)))
+  }
+  n_stops = table(gtfs_stops$stop_name)
+
+  gtfs_single_stops = gtfs_stops %>% filter(stop_name %in% names(n_stops)[n_stops == 1])
+  gtfs_multip_stops = gtfs_stops %>% filter(stop_name %in% names(n_stops)[n_stops != 1])
+
+  gtfs_multip_stops <- gtfs_multip_stops %>%
+    dplyr::group_by_at(by) %>%
+    dplyr::summarise(distances = geodist_list(stop_lon, stop_lat, stop_id), .groups = "keep") %>%
+    dplyr::mutate(n_stop_ids = nrow(distances[[1]]),
+                  dist_mean = median(prep_dist_mtrx(distances)),
+                  dist_median = median(prep_dist_mtrx(distances)),
+                  dist_max = max(prep_dist_mtrx(distances))) %>% ungroup()
+
+  # tidytable version
+  # gtfs_multip_stops <- gtfs_multip_stops %>%
+  #   tidytable::summarise.(distances = geodist_list(stop_lon, stop_lat, stop_id), .by = by) %>%
+  #   tidytable::mutate.(n_stop_ids = nrow(distances[[1]]),
+  #                      dist_mean = median(prep_dist_mtrx(distances)),
+  #                      dist_median = median(prep_dist_mtrx(distances)),
+  #                      dist_max = max(prep_dist_mtrx(distances)), .by = "stop_name")
+
+  gtfs_single_stops <- gtfs_single_stops %>% 
+    select(stop_name) %>% 
+    dplyr::mutate(distances = list(matrix(0)), n_stop_ids = 1, dist_mean = 0, dist_median = 0, dist_max = 0)
+
+  dists = dplyr::as_tibble(dplyr::bind_rows(gtfs_single_stops, gtfs_multip_stops))
+  dists[order(dists$dist_max, dists$n_stop_ids, dists[[by]], decreasing = T),]
+}
+
+#' Cluster nearby stops within a group
+#' 
+#' Finds clusters of stops for each unique value in `group_col` (e.g. stop_name). Can 
+#' be used to find different groups of stops that share the same name but are located more
+#' than `max_dist` apart. `gtfs_stops` is assigned a new column (named `cluster_colname`) 
+#' which contains the `group_col` value and the cluster number.
+#' 
+#' [stats::kmeans()] is used for clustering.
+#' 
+#' @param gtfs_stops Stops table of a gtfs object. It is also possible to pass a 
+#'                   tidygtfs object to enable piping.
+#' @param max_dist Only stop groups that have a maximum distance among them above this
+#'                 threshold (in meters) are clustered.
+#' @param group_col Clusters for are calculated for each set of stops with the same value
+#'                  in this column (default: stop_name) 
+#' @param cluster_colname Name of the new column name. Can be the same as group_col to overwrite. 
+#' 
+#' @return Returns a stops table with an added cluster column. If `gtfs_stops` is a tidygtfs object, a 
+#'         modified tidygtfs object is return
+#' 
+#' @importFrom stats kmeans
+#' @examples \donttest{
+#' library(dplyr)
+#' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
+#' nyc <- read_gtfs(nyc_path)
+#' nyc <- cluster_stops(nyc)
+#' 
+#' # There are 6 stops with the name "86 St" that are far apart
+#' stops_86_St = nyc$stops %>% 
+#'   filter(stop_name == "86 St")
+#'
+#' table(stops_86_St$stop_name_cluster)
+#' #> 86 St [1] 86 St [2] 86 St [3] 86 St [4] 86 St [5] 86 St [6] 
+#' #>         3         3         3         3         3         3 
+#'
+#' stops_86_St %>% select(stop_id, stop_name, parent_station, stop_name_cluster) %>% head()
+#' #> # A tibble: 6 × 4
+#' #>   stop_id stop_name parent_station stop_name_cluster
+#' #>   <chr>   <chr>     <chr>          <chr>            
+#' #> 1 121     86 St     ""             86 St [3]        
+#' #> 2 121N    86 St     "121"          86 St [3]        
+#' #> 3 121S    86 St     "121"          86 St [3]        
+#' #> 4 626     86 St     ""             86 St [4]        
+#' #> 5 626N    86 St     "626"          86 St [4]        
+#' #> 6 626S    86 St     "626"          86 St [4]
+#' 
+#' library(ggplot2)
+#' ggplot(stops_86_St) +
+#'   geom_point(aes(stop_lon, stop_lat, color = stop_name_cluster))
+#' }
+#' @export
+cluster_stops = function(gtfs_stops,
+                         max_dist = 300,
+                         group_col = "stop_name",
+                         cluster_colname = "stop_name_cluster") {
+  if(inherits(gtfs_stops, "tidygtfs")) {
+    gstops = gtfs_stops$stops
+  } else {
+    gstops = gtfs_stops 
+  }
+
+  is_sf = inherits(gstops, "sf")
+  stops_clusters = lapply(unique(gstops[[group_col]]), function(sn) {
+    stop_name_set = gstops[gstops[[group_col]] == sn,]
+    stop_name_set[[cluster_colname]] <- sn
+    if(nrow(stop_name_set) == 1) return(stop_name_set)
+
+    dists = stop_distances(stop_name_set)
+    if(max(dists$distance) > max_dist) {
+      if(is_sf) {
+        stop_name_lonlat = do.call(rbind, sf::st_geometry(stop_name_set))
+      } else {
+        stop_name_lonlat = stop_name_set[,c("stop_lon", "stop_lat")]
+      }
+
+      stops_unique_coords = unique(stop_name_lonlat)
+      n_dists = min(length(unique(dists$distance)), nrow(stops_unique_coords))
+      n_clusters = min(n_dists, nrow(stop_name_set)-1)
+
+      kms = kmeans(stop_name_lonlat, n_clusters)
+
+      stop_name_set[[cluster_colname]] <- paste0(sn, " [", kms$cluster, "]")
+    }
+    stop_name_set
+  })
+  stops_clusters = dplyr::bind_rows(stops_clusters)
+
+  if(inherits(gtfs_stops, "tidygtfs")) {
+    gtfs_stops$stops <- stops_clusters
+    return(gtfs_stops)
+  } else {
+    return(stops_clusters)
+  }
+}
diff --git a/R/raptor.R b/R/raptor.R
@@ -86,7 +86,7 @@ raptor = function(stop_times,
   if(!is.character(stop_ids) && !is.null(stop_ids)) {
     stop("stop_ids must be a character vector (or NULL)")
   }
-
+  
   # check and params ####
   # stop ids need to be a character vector
   # use data.table for faster manipulation
@@ -298,12 +298,17 @@ raptor = function(stop_times,
 #' Calculate shortest travel times from a stop to all reachable stops
 #' 
 #' Function to calculate the shortest travel times from a stop (given by `stop_name`) 
-#' to all other stops of a feed. `filtered_stop_times` needs to be created before with 
+#' to all other stop_names of a feed. `filtered_stop_times` needs to be created before with 
 #' [filter_stop_times()] or [filter_feed_by_date()].
 #' 
 #' This function allows easier access to [raptor()] by using stop names instead of ids and 
 #' returning shortest travel times by default.
 #' 
+#' Note however that stop_name might not be a suitable identifier for a feed. It is possible
+#' that multiple stops have the same name while not being related or geographically close to
+#' each other. [stop_group_distances()] and [cluster_stops()] can help identify and fix 
+#' issues with stop_names.
+#'  
 #' @param filtered_stop_times stop_times data.table (with transfers and stops tables as 
 #'                            attributes) created with [filter_stop_times()] where the 
 #'                            departure or arrival time has been set. Alternatively,
@@ -324,15 +329,26 @@ raptor = function(stop_times,
 #' @param return_coords Returns stop coordinates as columms. Default is FALSE.                            
 #' @param return_DT travel_times() returns a data.table if TRUE. Default is FALSE which 
 #'                  returns a tibble/tbl_df.
-#'                           
+#' @param stop_dist_check stop_names are not structured identifiers like 
+#'                            stop_ids or parent_stations, so it's possible that 
+#'                            stops with the same name are far apart. travel_times()
+#'                            errors if the distance among stop_ids with the same name is
+#'                            above this threshold (in meters).
+#'                            Use FALSE to turn check off. However, it is recommended to
+#'                            either use [raptor()] or fix the feed (see [cluster_stops()]).
+#'
 #' @return A table with travel times to/from all stops reachable by `stop_name` and their
 #'         corresponding journey departure and arrival times.
+#'         
 #' @importFrom data.table fifelse
 #' @export
 #' @examples \donttest{
 #' nyc_path <- system.file("extdata", "google_transit_nyc_subway.zip", package = "tidytransit")
 #' nyc <- read_gtfs(nyc_path)
 #' 
+#' # stop_names in this feed are not restricted to an area, create clusters of stops to fix
+#' nyc <- cluster_stops(nyc, group_col = "stop_name", cluster_colname = "stop_name")
+#' 
 #' # Use journeys departing after 7 AM with arrival time before 9 AM on 26th June
 #' stop_times <- filter_stop_times(nyc, "2018-06-26", 7*3600, 9*3600)
 #' 
@@ -355,8 +371,10 @@ travel_times = function(filtered_stop_times,
                         max_transfers = NULL,
                         max_departure_time = NULL,
                         return_coords = FALSE,
-                        return_DT = FALSE) {
+                        return_DT = FALSE,
+                        stop_dist_check = 300) {
   travel_time <- journey_arrival_time <- journey_departure_time <- NULL
+  stop_names = stop_name; rm(stop_name)
   if("tidygtfs" %in% class(filtered_stop_times)) {
     gtfs_obj = filtered_stop_times
     if(is.null(attributes(gtfs_obj$stop_times)$extract_date)) {
@@ -387,11 +405,18 @@ travel_times = function(filtered_stop_times,
   }
 
   # get stop_ids of names
-  stop_ids = NULL
-  if(!is.null(stop_name)) {
-    stop_ids = stops$stop_id[which(stops$stop_name %in% stop_name)]
-    if(length(stop_ids) == 0) {
-      stop(paste0("Stop name '", stop_name, "' not found in stops table"))
+  stop_ids = stops$stop_id[which(stops$stop_name %in% stop_names)]
+  if(length(stop_ids) == 0) {
+    stop(paste0("Stop name '", stop_names, "' not found in stops table"))
+  }
+
+  # Check stop_name integrity
+  if(length(stop_ids) > 1 & !is.null(stop_dist_check) & !isFALSE(stop_dist_check)) {
+    stop_dists = stop_group_distances(stops, "stop_name")
+
+    if(max(stop_dists$dist_max) > stop_dist_check) {
+      stop("Some stops with the same name are more than ", stop_dist_check, " meters apart, see stop_group_distances().\n",
+           "Using travel_times() might lead to unexpected results. Set stop_dist_check=FALSE to ignore this error.")
     }
   }