Skip to content

Commit

Permalink
Fix a bug from trying to scrape all trips
Browse files Browse the repository at this point in the history
I found a scraper (hello, Scrapy user!) trying to scrape all trips from
https://mitoc-trips.mit.edu/trips/?after=0001-10-17

The server crashed because I mistakenly assumed you can subtract 365
days from any given year. Let's not do that.

Separately, with *thousands* of trips, this view is getting pretty slow.
To monitor (and perhaps limit) scrapers, I might try to make the view a
bit more locked down. I don't *want* to paginate, but I don't want abuse
either.
  • Loading branch information
DavidCain committed Jun 24, 2024
1 parent 8e62018 commit 33c1063
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 15 deletions.
11 changes: 11 additions & 0 deletions ws/tests/views/test_trips.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,17 @@ def test_trips_with_filter(self):
self._expect_past_trips(response, [one_week_ago.pk, one_month_ago.pk])
self._expect_link_for_date(soup, "2016-11-15")

def test_trips_with_very_early_date(self):
"""You can ask for trips starting after the year 1."""
trip = factories.TripFactory.create(trip_date="2016-12-23")

# Filter based on a date in the past
response, soup = self._get("/trips/?after=0001-10-17")
self.assertFalse(response.context["date_invalid"])

self._expect_title(soup, "Trips after 1900-01-01")
self._expect_past_trips(response, [trip.pk])

def test_upcoming_trips_can_be_filtered(self):
"""If supplying an 'after' date in the future, that still permits filtering!"""
_next_week = factories.TripFactory.create(trip_date="2019-02-22")
Expand Down
45 changes: 30 additions & 15 deletions ws/views/trips.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@
from ws.middleware import RequestWithParticipant


# Hardcode the result of `models.Trip.objects.earliest('trip_date')
# There will never be any *older* trips, so we can save db queries.
FIRST_TRIP_DATE = date(2015, 1, 10)


class TripView(DetailView):
"""Display the trip to both unregistered users and known participants.
Expand Down Expand Up @@ -487,25 +492,30 @@ def get_queryset(self):
trips = super().get_queryset()
return annotated_for_trip_list(trips)

def _optionally_filter_from_args(self):
def _optionally_filter_from_args(self) -> tuple[date | None, bool]:
"""Return the date at which we want to omit previous trips, plus validity boolean.
If the user passes an invalid date (for example, because they were
manually building the query arguments), we don't want to 500 and
instead should just give them a simple warning message.
"""
start_date = None
start_date_invalid = False
if "after" in self.request.GET:
after = self.request.GET["after"]
try:
start_date = date.fromisoformat(after)
except (TypeError, ValueError):
start_date_invalid = True
else:
start_date_invalid = False
if "after" not in self.request.GET:
# No filtering, which is obviously valid
return None, False

after = self.request.GET["after"]
try:
start_date = date.fromisoformat(after)
except (TypeError, ValueError):
return None, True

return (start_date, start_date_invalid)
return (
# Guard against a weird edge case -- we can accept the year 1!
# We'll try to *subtract* time, and end up an exception rather than BCE datetimes
# There were no trips in our database before the 2010s anyway.
max(start_date, date(1900, 1, 1)),
False,
)

def get_context_data(self, **kwargs):
"""Sort trips into past and present trips."""
Expand All @@ -519,9 +529,14 @@ def get_context_data(self, **kwargs):
context["on_or_after_date"] = on_or_after_date
trips = trips.filter(trip_date__gte=on_or_after_date)

# Get approximately one year prior for use in paginating back in time.
# (need not be exact/handle leap years)
context["one_year_prior"] = (on_or_after_date or today) - timedelta(days=365)
oldest_trip_date = on_or_after_date or today
# Prevent rendering "Previous trips" if we know there won't be any.
# Also prevents a weird edge case -- you can pass a date in the year 1.
# (When the server tries to subtract 365 days, Python declines to make BCE dates)
if oldest_trip_date >= FIRST_TRIP_DATE:
# Get approximately one year prior for use in paginating back in time.
# (need not be exact/handle leap years)
context["one_year_prior"] = oldest_trip_date - timedelta(days=365)

# By default, just show upcoming trips.
context["current_trips"] = trips.filter(trip_date__gte=today)
Expand Down

0 comments on commit 33c1063

Please sign in to comment.