improve s3 checking
threnjen committed Nov 28, 2024
1 parent 84dd69d commit 8a59cdb
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions modules/bgg_scraper/spiders.py
```diff
@@ -90,18 +90,25 @@ def __init__(
         self.group = group
         self.s3_client = boto3.client("s3")
 
+    def check_file_exists(self, file_path: str) -> bool:
+        try:
+            self.s3_client.head_object(Bucket=S3_SCRAPER_BUCKET, Key=file_path)
+            return True
+        except:
+            return False
+
     def start_requests(self):
         for self.group_num, url in enumerate(self.scraper_urls_raw):
             user_id = url.split("username=")[-1].split("&rated")[0]
 
             # check S3 for existing user data
-            if S3FileHandler().check_file_exists(
+            if self.check_file_exists(
                 file_path=f"{WORKING_DIR}{self.save_file_path}/user_{user_id}.xml"
             ):
                 self.logger.info(f"User {user_id} already exists. Skipping...")
                 continue
 
-            print(f"Starting URL {self.group_num}: {url}")
+            self.logger.info(f"Starting URL {self.group_num}: {url}")
             yield scrapy.Request(
                 url=url,
                 meta={"group_num": self.group_num},
```
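One caveat with the committed `check_file_exists`: the bare `except:` treats every failure as "file does not exist", so an expired credential or a 403 AccessDenied would make the spider silently re-scrape users whose data is already in S3. A more defensive variant (a sketch, not part of this commit; the standalone function, the placeholder bucket value, and the demo key below are illustrative) could catch botocore's `ClientError` and only map a 404 to `False`:

```python
# Hypothetical hardening of check_file_exists; not what commit 8a59cdb ships.
# Assumes an S3_SCRAPER_BUCKET constant like the one spiders.py imports.
import boto3
from botocore.exceptions import ClientError

S3_SCRAPER_BUCKET = "my-scraper-bucket"  # assumption: real value comes from config


def check_file_exists(s3_client, file_path: str) -> bool:
    """Return True if the key exists, False only when S3 reports a 404."""
    try:
        s3_client.head_object(Bucket=S3_SCRAPER_BUCKET, Key=file_path)
        return True
    except ClientError as err:
        if err.response["Error"]["Code"] == "404":
            return False
        raise  # surface 403s, throttling, credential problems, etc.


if __name__ == "__main__":
    client = boto3.client("s3")
    print(check_file_exists(client, "data/user_12345.xml"))  # illustrative key
```

Re-raising non-404 errors makes misconfiguration fail loudly instead of being mistaken for missing data, while the happy path behaves exactly like the committed version.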
