Skip to content

Commit

Permalink
Add a testing mode for the 50-a officer scraper
Browse files Browse the repository at this point in the history
Add user agent information to 50-a scraper
  • Loading branch information
DMalone87 committed Oct 19, 2024
1 parent d044d10 commit 8aec071
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 8 deletions.
4 changes: 2 additions & 2 deletions scrapers/fifty_a/fifty_a/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "fifty_a (+http://www.yourdomain.com)"
USER_AGENT = "NPDI/2.0 (+http://www.nationalpolicedata.org)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
Expand All @@ -25,7 +25,7 @@
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
Expand Down
44 changes: 38 additions & 6 deletions scrapers/fifty_a/fifty_a/spiders/officer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import random
import logging
from typing import Any, Dict, List, Optional, Tuple

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapy.linkextractors import LinkExtractor
from scrapers.common.parse import parse_string_to_number
from scrapers.fifty_a.fifty_a.items import OfficerItem

Expand All @@ -12,10 +13,41 @@ class OfficerSpider(CrawlSpider):
allowed_domains = ["www.50-a.org"]
start_urls = ["https://www.50-a.org/commands"]

rules = (
Rule(LinkExtractor(allow="command"), follow=True),
Rule(LinkExtractor(allow="officer"), callback="parse_officer"),
)
def __init__(self, *args, **kwargs):
    """Initialize the spider, optionally in test mode.

    Spider arguments (passed via ``scrapy crawl -a name=value``, always
    strings when supplied from the CLI):
        test_mode: "true" (case-insensitive) limits the crawl to a small
            random sample of officers. Anything else disables test mode.
        max_officers: cap on officers scraped per command in test mode
            (default 10).
    """
    super(OfficerSpider, self).__init__(*args, **kwargs)
    # BUG FIX: the original did kwargs.get("test_mode", False).lower(),
    # which raises AttributeError when test_mode is absent because the
    # default is the bool False. Coerce to str before lowercasing so the
    # spider starts cleanly with no arguments.
    self.test_mode = str(kwargs.get("test_mode", "false")).lower() == "true"
    self.max_officers = int(kwargs.get("max_officers", 10))

    if self.test_mode:
        # Follow only officer pages; command fan-out is handled (and
        # sampled) manually in parse_start_url/parse_command.
        self.rules = (
            Rule(LinkExtractor(allow="officer"), callback="parse_officer"),
        )
    else:
        self.rules = (
            Rule(LinkExtractor(allow="command"), follow=True),
            Rule(LinkExtractor(allow="officer"), callback="parse_officer"),
        )
    # NOTE(review): CrawlSpider compiles self.rules inside its __init__,
    # which has already run by this point — rules assigned here are likely
    # never compiled, and crawling works only because parse_start_url /
    # parse_command follow links manually. Confirm, and either set rules
    # before super().__init__ or call self._compile_rules() if the rules
    # are meant to be active.

def parse_start_url(self, response):
    """Fan out from the /commands index page.

    In normal mode every command page is followed; in test mode a single
    randomly chosen command is followed (or nothing, if no command links
    are present on the page).
    """
    command_links = response.css("a.command::attr(href)").getall()

    if not self.test_mode:
        for link in command_links:
            yield response.follow(link, self.parse_command)
        return

    # Test mode: sample exactly one command to keep the run small.
    if command_links:
        chosen = random.choice(command_links)
        yield response.follow(chosen, self.parse_command)

def parse_command(self, response):
    """Extract officer profile links from a command roster page and
    request each one, handing the response to parse_officer.

    In test mode the link list is shuffled and truncated to
    ``self.max_officers`` so runs stay small.
    """
    officer_links = response.css("td.officer a.name::attr(href)").getall()
    logging.info(f"Found {len(officer_links)} officers in {response.url}")

    if self.test_mode:
        # Random subset rather than the first N, so repeated test runs
        # exercise different officer pages.
        random.shuffle(officer_links)
        officer_links = officer_links[: self.max_officers]

    for officer_link in officer_links:
        # BUG FIX: corrected log-message typo "Yeilding" -> "Yielding".
        logging.info(f"Yielding request for {officer_link}")
        yield response.follow(officer_link, self.parse_officer)

def parse_officer(self, response):
race, gender = self.parse_race_and_gender(response)
Expand Down

0 comments on commit 8aec071

Please sign in to comment.