Fixing aggregate query performance (#950)
devinmatte authored Feb 11, 2024
1 parent 8e8036d commit 22d43fa
Showing 2 changed files with 4 additions and 3 deletions.
6 changes: 3 additions & 3 deletions server/chalicelib/s3.py
@@ -81,9 +81,9 @@ def parallel_download_events(datestop):
 
 
 def download_events(sdate, edate, stops: list):
-    # This used to be month_range but updated to date_range to support live ranges
-    # If something breaks, this may be why
-    datestops = itertools.product(parallel.date_range(sdate, edate), stops)
+    # This needs to be month_range for performance and memory,
+    # however, for data from gobble we'll need specific dates, not just first of the month
+    datestops = itertools.product(parallel.month_range(sdate, edate), stops)
     result = parallel_download_events(datestops)
     result = filter(lambda row: sdate.strftime("%Y-%m-%d") <= row["service_date"] <= edate.strftime("%Y-%m-%d"), result)
     return sorted(result, key=lambda row: row["event_time"])
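Why this helps: itertools.product pairs every key returned by the range helper with every stop, and each (date, stop) pair becomes one S3 download in parallel_download_events. Iterating by month instead of by day therefore cuts the number of objects fetched roughly 30x, and the service_date filter afterwards trims the month-sized result back to the requested window, so the wider fetch does not change what the endpoint returns. A minimal sketch of the idea, assuming month_range yields one first-of-month date per month and date_range yields every calendar day (the helper bodies below are illustrative stand-ins, not the actual implementations in parallel.py):

import itertools
from datetime import date, timedelta

# Illustrative stand-ins for parallel.date_range / parallel.month_range;
# the real helpers may differ -- assumed behavior only.
def date_range(sdate: date, edate: date):
    """Yield every calendar day from sdate through edate."""
    d = sdate
    while d <= edate:
        yield d
        d += timedelta(days=1)

def month_range(sdate: date, edate: date):
    """Yield the first of each month touched by [sdate, edate]."""
    d = sdate.replace(day=1)
    while d <= edate:
        yield d
        d = (d.replace(day=28) + timedelta(days=4)).replace(day=1)  # first of next month

stops = ["70061", "70063"]  # hypothetical stop ids
sdate, edate = date(2024, 1, 1), date(2024, 2, 10)

# Each (date, stop) pair becomes one S3 download, so fewer keys means fewer requests.
daily = list(itertools.product(date_range(sdate, edate), stops))    # 41 days * 2 stops = 82 keys
monthly = list(itertools.product(month_range(sdate, edate), stops)) # 2 months * 2 stops = 4 keys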
1 change: 1 addition & 0 deletions server/chalicelib/s3_historical.py
@@ -77,6 +77,7 @@ def headways(stop_ids: list, sdate, edate):
         headway_time_sec = delta.total_seconds()
 
         # Throw out any headways > 120 min
+        # TODO: We can't do this anymore for CR data
         if headway_time_sec > 120 * 60:
             continue
 
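The added TODO flags that the 120-minute cap, while reasonable for subway headways, would silently discard legitimate Commuter Rail headways, since off-peak CR trains can be more than two hours apart. A hypothetical sketch of one way the cap could become mode-dependent (the route_type parameter and the commuter-rail threshold are illustrative assumptions, not part of this commit):

# Hypothetical per-mode headway caps; only the 120-minute value comes from the existing code.
MAX_HEADWAY_SEC = {
    "subway": 120 * 60,            # existing cutoff in s3_historical.headways
    "commuter_rail": 6 * 60 * 60,  # illustrative: allow long off-peak CR gaps
}

def keep_headway(headway_time_sec: float, route_type: str = "subway") -> bool:
    """Return True if the headway should be kept rather than thrown out."""
    return headway_time_sec <= MAX_HEADWAY_SEC.get(route_type, 120 * 60)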
