Skip to content

Commit

Permalink
throttle user scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
threnjen committed Nov 27, 2024
1 parent b9cf14a commit 3263bb9
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 17 deletions.
45 changes: 31 additions & 14 deletions modules/bgg_scraper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,22 +160,23 @@ def run_game_scraper_processes(self):
)

def _run_scrapy_scraper(self, scraper_urls_raw) -> None:
process = CrawlerProcess(
settings={
"LOG_LEVEL": "DEBUG",
"BOT_NAME": self.bot_scraper_name,
"ROBOTSTXT_OBEY": ROBOTSTXT_OBEY,
# "DOWNLOAD_DELAY": DOWNLOAD_DELAY,
"COOKIES_ENABLED": COOKIES_ENABLED,
"AUTOTHROTTLE_ENABLED": AUTOTHROTTLE_ENABLED,
"AUTOTHROTTLE_START_DELAY": AUTOTHROTTLE_START_DELAY,
"AUTOTHROTTLE_MAX_DELAY": AUTOTHROTTLE_MAX_DELAY,
"AUTOTHROTTLE_TARGET_CONCURRENCY": AUTOTHROTTLE_TARGET_CONCURRENCY,
"AUTOTHROTTLE_DEBUG": AUTOTHROTTLE_DEBUG,
}
)

if self.scraper_type in ["games", "ratings"]:
process = CrawlerProcess(
settings={
"LOG_LEVEL": "DEBUG",
"BOT_NAME": self.bot_scraper_name,
"ROBOTSTXT_OBEY": ROBOTSTXT_OBEY,
"DOWNLOAD_DELAY": 2,
"COOKIES_ENABLED": COOKIES_ENABLED,
"AUTOTHROTTLE_ENABLED": AUTOTHROTTLE_ENABLED,
"AUTOTHROTTLE_START_DELAY": AUTOTHROTTLE_START_DELAY,
"AUTOTHROTTLE_MAX_DELAY": AUTOTHROTTLE_MAX_DELAY,
"AUTOTHROTTLE_TARGET_CONCURRENCY": AUTOTHROTTLE_TARGET_CONCURRENCY,
"AUTOTHROTTLE_DEBUG": AUTOTHROTTLE_DEBUG,
}
)

process.crawl(
GameSpider,
name="bgg_raw",
Expand All @@ -188,6 +189,22 @@ def _run_scrapy_scraper(self, scraper_urls_raw) -> None:
process.start()

if self.scraper_type == "users":
process = CrawlerProcess(
settings={
"LOG_LEVEL": "DEBUG",
"BOT_NAME": self.bot_scraper_name,
"ROBOTSTXT_OBEY": ROBOTSTXT_OBEY,
"DOWNLOAD_DELAY": 2,
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
"COOKIES_ENABLED": COOKIES_ENABLED,
"AUTOTHROTTLE_ENABLED": AUTOTHROTTLE_ENABLED,
"AUTOTHROTTLE_START_DELAY": AUTOTHROTTLE_START_DELAY,
"AUTOTHROTTLE_MAX_DELAY": AUTOTHROTTLE_MAX_DELAY,
"AUTOTHROTTLE_TARGET_CONCURRENCY": AUTOTHROTTLE_TARGET_CONCURRENCY,
"AUTOTHROTTLE_DEBUG": AUTOTHROTTLE_DEBUG,
}
)

process.crawl(
UserSpider,
name="bgg_users",
Expand Down
9 changes: 6 additions & 3 deletions modules/bgg_scraper/scrapy_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,14 @@
# 'bggscraper.middlewares.BggscraperSpiderMiddleware': 543,
# }


# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'bggscraper.middlewares.BggscraperDownloaderMiddleware': 543,
# }
# Enable the project's downloader middleware with order value 543.
# NOTE(review): the CrawlerProcess settings dicts visible in main.py list
# their keys explicitly and include neither DOWNLOADER_MIDDLEWARES nor
# RETRY_HTTP_CODES -- confirm this module is actually loaded by the crawl.
DOWNLOADER_MIDDLEWARES = {
"bggscraper.middlewares.BggscraperDownloaderMiddleware": 543,
}
# Retry responses that come back with HTTP 429 (Too Many Requests).
# NOTE(review): assigning RETRY_HTTP_CODES presumably replaces Scrapy's
# built-in default list instead of extending it, so transient 5xx/408
# responses would no longer be retried -- confirm that is intended.
RETRY_HTTP_CODES = [429]


# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
Expand Down

0 comments on commit 3263bb9

Please sign in to comment.