Skip to content

Commit

Permalink
update scraper for game and ratings data saves
Browse files Browse the repository at this point in the history
  • Loading branch information
threnjen committed Nov 28, 2024
1 parent 4b222a5 commit 6e29195
Showing 1 changed file with 11 additions and 1 deletion.
12 changes: 11 additions & 1 deletion modules/bgg_scraper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,19 @@ def __init__(
self.scraper_urls_raw = scraper_urls_raw
self.save_file_path = save_file_path
self.group = group
self.s3_file_handler = S3FileHandler()
self.local_file_handler = LocalFileHandler()

def start_requests(self):
    """Yield one ``scrapy.Request`` per raw URL that has not been saved yet.

    Each URL in ``self.scraper_urls_raw`` is identified by its position in
    that sequence. Before a request is issued, the S3 file handler is asked
    whether ``{WORKING_DIR}{save_file_path}/{group}_{index}.xml`` exists;
    if it does, the URL is skipped so that a re-run does not download it
    again.

    Yields:
        scrapy.Request: a request whose callback is ``self._save_response``
        with ``response_id`` pre-bound to the URL's index.
    """
    for i, url in enumerate(self.scraper_urls_raw):
        # Key under which a previous run would have stored this response.
        existing_path = f"{WORKING_DIR}{self.save_file_path}/{self.group}_{i}.xml"

        if self.s3_file_handler.check_file_exists(file_path=existing_path):
            self.logger.info(f"ID {self.group}_{i} already exists. Skipping...")
            continue

        # Use the spider logger (not print) so start and skip messages go
        # through the same sink and respect the configured log level.
        self.logger.info(f"Starting URL {i}: {url}")

        # Bind the index now; the callback only receives the response.
        save_response_with_index = partial(self._save_response, response_id=i)
        yield scrapy.Request(url=url, callback=save_response_with_index)
Expand All @@ -61,7 +71,7 @@ def _save_response(self, response: scrapy.http.Response, response_id: int):

save_file_local_first(
path=self.save_file_path,
file_name=f"{self.group}_{response_id}_{timestamp}.xml",
file_name=f"{self.group}_{response_id}.xml",
data=response.body,
)

Expand Down

0 comments on commit 6e29195

Please sign in to comment.