Skip to content

Commit

Permalink
Optimize code and improve documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
solver-app[bot] authored Nov 1, 2024
1 parent 99b2de5 commit 6f422ea
Show file tree
Hide file tree
Showing 6 changed files with 667 additions and 321 deletions.
194 changes: 160 additions & 34 deletions city_scrapers/mixins/cuya_county.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,55 @@


class CuyaCountyMixin:
    """
    Mixin for scraping meeting information from Cuyahoga County websites.

    Provides common parsing methods for meeting details across different
    departments.
    """

    timezone = "America/Detroit"
    # Default meeting location shared by the spiders that mix this in
    location = {
        "name": "County Headquarters",
        "address": "2079 East 9th St Cleveland, OH 44115",
    }

    # CSS selectors defined as class constants for reuse
    DETAIL_LINK_SELECTOR = ".gridViewStyle td:nth-child(2) a::attr(href)"
    TITLE_SELECTOR = "#contentColumn h1::text"
    DATETIME_SELECTOR = "blockquote dd::text"

    # Date/time format constant, e.g. "11/01/2024-10:00 AM"
    DATETIME_FORMAT = "%m/%d/%Y-%I:%M %p"
def parse(self, response):
    """
    Parse the list page to find and follow links to individual meeting pages.

    Args:
        response: Scrapy response object from the list page

    Yields:
        Scrapy Request objects for each meeting detail page
    """
    for detail_link in response.css(self.DETAIL_LINK_SELECTOR).extract():
        # dont_filter: the same detail URL may legitimately recur in the
        # listing table, so Scrapy's duplicate filter is bypassed
        yield response.follow(
            detail_link, callback=self._parse_detail, dont_filter=True
        )

def _parse_detail(self, response):
"""Yield a meeting from an individual event page"""
"""
Parse an individual meeting page to create a Meeting object.
Args:
response: Scrapy response object from a meeting detail page
Yields:
Meeting: Object containing all parsed meeting information
"""
title = self._parse_title(response)
start, end = self._parse_start_end(response)

meeting = Meeting(
title=title,
description=self._parse_description(response),
Expand All @@ -33,58 +64,153 @@ def _parse_detail(self, response):
all_day=False,
location=self._parse_location(response),
links=self._parse_links(response),
source=self._parse_source(response),
source=response.url, # Direct use instead of separate method
)
meeting["status"] = self._parse_status(response, meeting)

meeting["status"] = self._get_status(meeting) # Direct use instead of _parse_status
meeting["id"] = self._get_id(meeting)
yield meeting

def _parse_title(self, response):
    """
    Extract and clean the meeting title.

    Args:
        response: Scrapy response object

    Returns:
        str: Meeting title; the " Meeting" suffix is stripped unless the
        title marks a special meeting
    """
    # Guard against a missing <h1>: extract_first() returns None when the
    # selector matches nothing, which would otherwise crash on .strip()
    title_str = (response.css(self.TITLE_SELECTOR).extract_first() or "").strip()
    if "Special" in title_str:
        return title_str
    return title_str.replace(" Meeting", "").strip()

def _parse_start_end(self, response):
    """
    Extract start and end datetimes for the meeting.

    Args:
        response: Scrapy response object

    Returns:
        tuple: (start datetime, end datetime or None); end is None when
        no second time is provided or it fails to parse

    Raises:
        ValueError: if no datetime strings are found on the page
    """
    datetime_strings = [
        d.strip() for d in response.css(self.DATETIME_SELECTOR).extract()
    ]

    if not datetime_strings:
        raise ValueError("No datetime information found")

    # The first <dd> string is the (required) start time
    start = datetime.strptime(datetime_strings[0], self.DATETIME_FORMAT)

    # A second string, when present and parseable, is the end time; other
    # <dd> entries (e.g. a location line) simply fail to parse and are skipped
    end = None
    if len(datetime_strings) > 1:
        try:
            end = datetime.strptime(datetime_strings[1], self.DATETIME_FORMAT)
        except ValueError:
            # End time parsing failed, leave as None
            pass

    return start, end

# Additional CSS selectors: document links and embedded video players
# (referenced by _parse_links below)
LINKS_SELECTOR = "blockquote a"
VIDEO_SELECTOR = ".embed-container iframe"

# Regex patterns for location parsing: a 3-digit room number identifies the
# address line among the <dd> strings, and runs of whitespace are collapsed
ROOM_NUMBER_PATTERN = re.compile(r" \d{3}")
WHITESPACE_PATTERN = re.compile(r"\s+")

def _parse_description(self, response):
    """
    Return the meeting description.

    The source pages do not publish a description, so this always yields
    the empty string.

    Args:
        response: Scrapy response object (unused)

    Returns:
        str: Empty string — descriptions are not available in the source HTML
    """
    return ""

def _parse_classification(self, title):
    """
    Return the meeting classification.

    The value comes from the ``classification`` attribute each concrete
    spider class declares; the title argument is ignored here.

    Args:
        title: Meeting title string (unused in this base implementation)

    Returns:
        Classification value defined on the spider class
    """
    return self.classification

def _parse_location(self, response):
    """
    Extract the meeting location from the detail strings.

    Args:
        response: Scrapy response object

    Returns:
        str or None: Whitespace-normalized detail string containing a
        3-digit room number (e.g. "Committee Room B, 2079 East 9th St,
        Room 217" from a multi-space original), or None if no such
        string is found
    """
    # Location shares the same <dd> selector as the date/time strings;
    # a 3-digit room number distinguishes the address line
    loc_str = None
    for detail_str in response.css(self.DATETIME_SELECTOR).extract():
        if self.ROOM_NUMBER_PATTERN.search(detail_str):
            # Keep scanning so the LAST matching entry wins, preserving the
            # original pre-refactor behavior (an early return here would
            # silently switch to first-match semantics)
            loc_str = self.WHITESPACE_PATTERN.sub(" ", detail_str).strip()
    return loc_str

def _parse_links(self, response):
    """
    Extract links to meeting materials and video streams.

    Args:
        response: Scrapy response object

    Returns:
        list: List of dictionaries containing:
            - title: Link text or "Video" for video streams
            - href: Absolute URL to resource

    Example:
        [
            {"title": "Agenda PDF", "href": "http://example.com/agenda.pdf"},
            {"title": "Video", "href": "http://example.com/stream"}
        ]
    """
    links = []

    # Document links inside the detail blockquote; urljoin resolves
    # relative hrefs against the page URL
    for link in response.css(self.LINKS_SELECTOR):
        links.append(
            {
                "title": " ".join(link.css("*::text").extract()),
                "href": response.urljoin(link.attrib["href"]),
            }
        )

    # Embedded video streams (iframe src is already absolute)
    for iframe in response.css(self.VIDEO_SELECTOR):
        links.append({"title": "Video", "href": iframe.attrib["src"]})

    return links

def _parse_source(self, response):
    """Return the URL of the meeting detail page as the meeting source."""
    return response.url

def _parse_status(self, response, meeting):
    """Return the meeting status via the shared _get_status helper.

    NOTE(review): response is unused here; presumably kept for a uniform
    _parse_* signature — confirm against sibling mixins.
    """
    return self._get_status(meeting)
Loading

0 comments on commit 6f422ea

Please sign in to comment.