improve s3 checking
threnjen committed Nov 28, 2024
1 parent 84dd69d commit 8a59cdb
Showing 1 changed file with 9 additions and 2 deletions.
11 changes: 9 additions & 2 deletions modules/bgg_scraper/spiders.py
```diff
@@ -90,18 +90,25 @@ def __init__(
         self.group = group
         self.s3_client = boto3.client("s3")
 
+    def check_file_exists(self, file_path: str) -> bool:
+        try:
+            self.s3_client.head_object(Bucket=S3_SCRAPER_BUCKET, Key=file_path)
+            return True
+        except:
+            return False
+
     def start_requests(self):
         for self.group_num, url in enumerate(self.scraper_urls_raw):
             user_id = url.split("username=")[-1].split("&rated")[0]
 
             # check S3 for existing user data
-            if S3FileHandler().check_file_exists(
+            if self.check_file_exists(
                 file_path=f"{WORKING_DIR}{self.save_file_path}/user_{user_id}.xml"
             ):
                 self.logger.info(f"User {user_id} already exists. Skipping...")
                 continue
 
-            print(f"Starting URL {self.group_num}: {url}")
+            self.logger.info(f"Starting URL {self.group_num}: {url}")
             yield scrapy.Request(
                 url=url,
                 meta={"group_num": self.group_num},
```
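One caveat with the committed `check_file_exists`: the bare `except:` treats every failure as "file does not exist", so an expired credential or a 403 AccessDenied would make the spider silently re-scrape users whose data is already in S3. A more defensive variant (a sketch, not part of this commit; the standalone function, the placeholder bucket value, and the demo key below are illustrative) could catch botocore's `ClientError` and only map a 404 to `False`:

```python
# Hypothetical hardening of check_file_exists; not what commit 8a59cdb ships.
# Assumes an S3_SCRAPER_BUCKET constant like the one spiders.py imports.
import boto3
from botocore.exceptions import ClientError

S3_SCRAPER_BUCKET = "my-scraper-bucket"  # assumption: real value comes from config


def check_file_exists(s3_client, file_path: str) -> bool:
    """Return True if the key exists, False only when S3 reports a 404."""
    try:
        s3_client.head_object(Bucket=S3_SCRAPER_BUCKET, Key=file_path)
        return True
    except ClientError as err:
        if err.response["Error"]["Code"] == "404":
            return False
        raise  # surface 403s, throttling, credential problems, etc.


if __name__ == "__main__":
    client = boto3.client("s3")
    print(check_file_exists(client, "data/user_12345.xml"))  # illustrative key
```

Re-raising non-404 errors makes misconfiguration fail loudly instead of being mistaken for missing data, while the happy path behaves exactly like the committed version.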
