diff --git a/modules/bgg_scraper/main.py b/modules/bgg_scraper/main.py index 64cfcde..a51ad34 100644 --- a/modules/bgg_scraper/main.py +++ b/modules/bgg_scraper/main.py @@ -160,22 +160,23 @@ def run_game_scraper_processes(self): ) def _run_scrapy_scraper(self, scraper_urls_raw) -> None: - process = CrawlerProcess( - settings={ - "LOG_LEVEL": "DEBUG", - "BOT_NAME": self.bot_scraper_name, - "ROBOTSTXT_OBEY": ROBOTSTXT_OBEY, - # "DOWNLOAD_DELAY": DOWNLOAD_DELAY, - "COOKIES_ENABLED": COOKIES_ENABLED, - "AUTOTHROTTLE_ENABLED": AUTOTHROTTLE_ENABLED, - "AUTOTHROTTLE_START_DELAY": AUTOTHROTTLE_START_DELAY, - "AUTOTHROTTLE_MAX_DELAY": AUTOTHROTTLE_MAX_DELAY, - "AUTOTHROTTLE_TARGET_CONCURRENCY": AUTOTHROTTLE_TARGET_CONCURRENCY, - "AUTOTHROTTLE_DEBUG": AUTOTHROTTLE_DEBUG, - } - ) if self.scraper_type in ["games", "ratings"]: + process = CrawlerProcess( + settings={ + "LOG_LEVEL": "DEBUG", + "BOT_NAME": self.bot_scraper_name, + "ROBOTSTXT_OBEY": ROBOTSTXT_OBEY, + "DOWNLOAD_DELAY": 2, + "COOKIES_ENABLED": COOKIES_ENABLED, + "AUTOTHROTTLE_ENABLED": AUTOTHROTTLE_ENABLED, + "AUTOTHROTTLE_START_DELAY": AUTOTHROTTLE_START_DELAY, + "AUTOTHROTTLE_MAX_DELAY": AUTOTHROTTLE_MAX_DELAY, + "AUTOTHROTTLE_TARGET_CONCURRENCY": AUTOTHROTTLE_TARGET_CONCURRENCY, + "AUTOTHROTTLE_DEBUG": AUTOTHROTTLE_DEBUG, + } + ) + process.crawl( GameSpider, name="bgg_raw", @@ -188,6 +189,22 @@ def _run_scrapy_scraper(self, scraper_urls_raw) -> None: process.start() if self.scraper_type == "users": + process = CrawlerProcess( + settings={ + "LOG_LEVEL": "DEBUG", + "BOT_NAME": self.bot_scraper_name, + "ROBOTSTXT_OBEY": ROBOTSTXT_OBEY, + "DOWNLOAD_DELAY": 4, + "CONCURRENT_REQUESTS_PER_DOMAIN": 1, + "COOKIES_ENABLED": COOKIES_ENABLED, + "AUTOTHROTTLE_ENABLED": AUTOTHROTTLE_ENABLED, + "AUTOTHROTTLE_START_DELAY": 3, + "AUTOTHROTTLE_MAX_DELAY": 60, + "AUTOTHROTTLE_TARGET_CONCURRENCY": 1, + "AUTOTHROTTLE_DEBUG": AUTOTHROTTLE_DEBUG, + } + ) + process.crawl( UserSpider, name="bgg_users", diff --git a/modules/bgg_scraper/scrapy_settings.py b/modules/bgg_scraper/scrapy_settings.py index 9d67bdd..39748c4 100644 --- a/modules/bgg_scraper/scrapy_settings.py +++ b/modules/bgg_scraper/scrapy_settings.py @@ -50,11 +50,14 @@ # 'bggscraper.middlewares.BggscraperSpiderMiddleware': 543, # } + # Enable or disable downloader middlewares # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html -# DOWNLOADER_MIDDLEWARES = { -# 'bggscraper.middlewares.BggscraperDownloaderMiddleware': 543, -# } +DOWNLOADER_MIDDLEWARES = { + "bggscraper.middlewares.BggscraperDownloaderMiddleware": 543, +} +RETRY_HTTP_CODES = [429] + # Enable or disable extensions # See https://docs.scrapy.org/en/latest/topics/extensions.html