Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Branding: improve IO resilience #3182

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bot/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def __init__(self, converter: Converter, original: Exception, infraction_arg: in


class BrandingMisconfigurationError(RuntimeError):
"""Raised by the Branding cog when a misconfigured event is encountered."""
"""Raised by the Branding cog when branding misconfiguration is detected."""



Expand Down
34 changes: 22 additions & 12 deletions bot/exts/backend/branding/_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,14 +342,14 @@ async def synchronise(self) -> tuple[bool, bool]:
"""
log.debug("Synchronise: fetching current event.")

current_event, available_events = await self.repository.get_current_event()
try:
current_event, available_events = await self.repository.get_current_event()
except Exception:
log.exception("Synchronisation aborted: failed to fetch events.")
return False, False

await self.populate_cache_events(available_events)

if current_event is None:
log.error("Failed to fetch event. Cannot synchronise!")
return False, False

return await self.enter_event(current_event)

async def populate_cache_events(self, events: list[Event]) -> None:
Expand Down Expand Up @@ -433,10 +433,6 @@ async def daemon_main(self) -> None:

await self.populate_cache_events(available_events)

if new_event is None:
log.warning("Daemon main: failed to get current event from branding repository, will do nothing.")
return

if new_event.path != await self.cache_information.get("event_path"):
log.debug("Daemon main: new event detected!")
await self.enter_event(new_event)
Expand Down Expand Up @@ -596,10 +592,24 @@ async def branding_calendar_refresh_cmd(self, ctx: commands.Context) -> None:
log.info("Performing command-requested event cache refresh.")

async with ctx.typing():
available_events = await self.repository.get_events()
await self.populate_cache_events(available_events)
try:
available_events = await self.repository.get_events()
except Exception:
log.exception("Refresh aborted: failed to fetch events.")
resp = make_embed(
"Refresh aborted",
"Failed to fetch events. See log for details.",
success=False,
)
else:
await self.populate_cache_events(available_events)
resp = make_embed(
"Refresh successful",
"The event calendar has been refreshed.",
success=True,
)

await ctx.invoke(self.branding_calendar_group)
await ctx.send(embed=resp)

# endregion
# region: Command interface (branding daemon)
Expand Down
73 changes: 48 additions & 25 deletions bot/exts/backend/branding/_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from datetime import UTC, date, datetime

import frontmatter
from aiohttp import ClientResponse, ClientResponseError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

from bot.bot import Bot
from bot.constants import Keys
Expand Down Expand Up @@ -71,6 +73,35 @@ def __str__(self) -> str:
return f"<Event at '{self.path}'>"


class GitHubServerError(Exception):
"""
GitHub responded with 5xx status code.

Such error shall be retried.
"""


def _raise_for_status(resp: ClientResponse) -> None:
"""Raise custom error if resp status is 5xx."""
# Use the response's raise_for_status so that we can
# attach the full traceback to our custom error.
log.trace(f"GitHub response status: {resp.status}")
try:
resp.raise_for_status()
except ClientResponseError as err:
if resp.status >= 500:
raise GitHubServerError from err
raise


_retry_server_error = retry(
retry=retry_if_exception_type(GitHubServerError), # Only retry this error.
stop=stop_after_attempt(5), # Up to 5 attempts.
wait=wait_exponential(), # Exponential backoff: 1, 2, 4, 8 seconds.
reraise=True, # After final failure, re-raise original exception.
)


class BrandingRepository:
"""
Branding repository abstraction.
Expand All @@ -93,6 +124,7 @@ class BrandingRepository:
def __init__(self, bot: Bot) -> None:
self.bot = bot

@_retry_server_error
async def fetch_directory(self, path: str, types: t.Container[str] = ("file", "dir")) -> dict[str, RemoteObject]:
"""
Fetch directory found at `path` in the branding repository.
Expand All @@ -105,14 +137,12 @@ async def fetch_directory(self, path: str, types: t.Container[str] = ("file", "d
log.debug(f"Fetching directory from branding repository: '{full_url}'.")

async with self.bot.http_session.get(full_url, params=PARAMS, headers=HEADERS) as response:
if response.status != 200:
raise RuntimeError(f"Failed to fetch directory due to status: {response.status}")

log.debug("Fetch successful, reading JSON response.")
_raise_for_status(response)
json_directory = await response.json()

return {file["name"]: RemoteObject(file) for file in json_directory if file["type"] in types}

@_retry_server_error
async def fetch_file(self, download_url: str) -> bytes:
"""
Fetch file as bytes from `download_url`.
Expand All @@ -122,10 +152,7 @@ async def fetch_file(self, download_url: str) -> bytes:
log.debug(f"Fetching file from branding repository: '{download_url}'.")

async with self.bot.http_session.get(download_url, params=PARAMS, headers=HEADERS) as response:
if response.status != 200:
raise RuntimeError(f"Failed to fetch file due to status: {response.status}")

log.debug("Fetch successful, reading payload.")
_raise_for_status(response)
return await response.read()

def parse_meta_file(self, raw_file: bytes) -> MetaFile:
Expand Down Expand Up @@ -186,37 +213,34 @@ async def get_events(self) -> list[Event]:
"""
Discover available events in the branding repository.

Misconfigured events are skipped. May return an empty list in the catastrophic case.
Propagate errors if an event fails to fetch or deserialize.
"""
log.debug("Discovering events in branding repository.")

try:
event_directories = await self.fetch_directory("events", types=("dir",)) # Skip files.
except Exception:
log.exception("Failed to fetch 'events' directory.")
return []
event_directories = await self.fetch_directory("events", types=("dir",)) # Skip files.

instances: list[Event] = []

for event_directory in event_directories.values():
log.trace(f"Attempting to construct event from directory: '{event_directory.path}'.")
try:
instance = await self.construct_event(event_directory)
except Exception as exc:
log.warning(f"Could not construct event '{event_directory.path}'.", exc_info=exc)
else:
instances.append(instance)
log.trace(f"Reading event directory: '{event_directory.path}'.")
instance = await self.construct_event(event_directory)
instances.append(instance)

return instances

async def get_current_event(self) -> tuple[Event | None, list[Event]]:
async def get_current_event(self) -> tuple[Event, list[Event]]:
"""
Get the currently active event, or the fallback event.

The second return value is a list of all available events. The caller may discard it, if not needed.
Returning all events alongside the current one prevents having to query the API twice in some cases.

The current event may be None in the case that no event is active, and no fallback event is found.
Raise an error in the following cases:
* GitHub request fails
* The branding repo contains an invalid event
* No event is active and the fallback event is missing

Events are validated in the branding repo. The bot assumes that events are valid.
"""
utc_now = datetime.now(tz=UTC)
log.debug(f"Finding active event for: {utc_now}.")
Expand Down Expand Up @@ -249,5 +273,4 @@ async def get_current_event(self) -> tuple[Event | None, list[Event]]:
if event.meta.is_fallback:
return event, available_events

log.warning("No event is currently active and no fallback event was found!")
return None, available_events
raise BrandingMisconfigurationError("No event is active and the fallback event is missing!")
18 changes: 16 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ python-frontmatter = "1.1.0"
rapidfuzz = "3.9.6"
regex = "2024.7.24"
sentry-sdk = "2.13.0"
tenacity = "9.0.0"
tldextract = "5.1.2"
pydantic = "2.6.4"
pydantic-settings = "2.2.1"
Expand Down