Merge pull request #60 from threnjen/directory_structure_updates
change directory structure for saved data
threnjen authored Nov 18, 2024
2 parents aefc1d6 + c4c5132 commit bd3603a
Showing 49 changed files with 75 additions and 292 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev_deployment_ecs_scraper.yml
@@ -31,7 +31,7 @@ jobs:
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2

- - name: Build, tag, push image to AWS ECR game_data_scraper
+ - name: Build, tag, push image to AWS ECR dev_bgg_scraper
env:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
ECR_REPOSITORY: dev_bgg_scraper
2 changes: 1 addition & 1 deletion .github/workflows/lambda_deployments_dev.yml
@@ -22,7 +22,7 @@ jobs:
ref: ${{ github.event.pull_request.head.sha }}
- name: Zip functions
run: |
- cd lambda_functions
+ cd modules/lambda_functions
zip -r bgg_scraper_fargate_trigger.zip bgg_scraper_fargate_trigger.py ../config.py ../utils
zip -r bgg_cleaner_fargate_trigger.zip bgg_cleaner_fargate_trigger.py ../config.py ../utils
zip -r generate_game_urls_lambda.zip generate_game_urls_lambda.py ../config.py ../utils
2 changes: 1 addition & 1 deletion Dockerfiles/Dockerfile.bgg_boardgame_file_retrieval
@@ -15,6 +15,6 @@ RUN rm ./chrome-installer.sh
RUN pip install selenium
RUN pip install awswrangler

- COPY bgg_boardgame_file_retrieval/get_bgg_games_file.py .
+ COPY modules/bgg_boardgame_file_retrieval/get_bgg_games_file.py .

CMD [ "get_bgg_games_file.lambda_handler" ]
2 changes: 1 addition & 1 deletion Dockerfiles/Dockerfile.bgg_orchestrator
@@ -13,7 +13,7 @@ RUN pip3 install pipenv
COPY aws_dagster_bgg aws_dagster_bgg
COPY aws_dagster_bgg/Pipfile* .
COPY utils utils
- COPY config.py .
+ COPY modules/config.py .
COPY pyproject.toml .

# Install dependencies with pipenv
12 changes: 6 additions & 6 deletions Dockerfiles/Dockerfile.game-data-cleaner
@@ -14,15 +14,15 @@ ENV FILENAME=$FILENAME
RUN pip3 install pipenv

# Create necessary directories
- RUN mkdir -p data/users/scraped_xml_raw \
-     data/games/scraped_xml_raw \
-     data/game_dfs_dirty
+ RUN mkdir -p data/prod/users/scraped_xml_raw \
+     data/prod/games/scraped_xml_raw \
+     data/prod/game_dfs_dirty

# Copy the source code into the container
- COPY game_data_cleaner game_data_cleaner
+ COPY modules/game_data_cleaner game_data_cleaner
COPY utils utils
- COPY game_data_cleaner/Pipfile* .
- COPY config.py .
+ COPY modules/game_data_cleaner/Pipfile* .
+ COPY modules/config.py .

# Install dependencies with pipenv
RUN pipenv sync
10 changes: 5 additions & 5 deletions Dockerfiles/Dockerfile.game-data-scraper
@@ -17,15 +17,15 @@ ENV SCRAPER_TYPE=$SCRAPER_TYPE
RUN pip3 install pipenv

# Create necessary directories
- RUN mkdir -p data/users/scraped_xml_raw \
-     data/games/scraped_xml_raw
+ RUN mkdir -p data/prod/users/scraped_xml_raw \
+     data/prod/games/scraped_xml_raw

# Copy the source code into the container
# COPY data game_data_scraper/data
- COPY game_data_scraper game_data_scraper
+ COPY modules/game_data_scraper game_data_scraper
COPY utils utils
- COPY game_data_scraper/Pipfile* .
- COPY config.py .
+ COPY modules/game_data_scraper/Pipfile* .
+ COPY modules/config.py .

# Install dependencies with pipenv
RUN pipenv sync
14 changes: 7 additions & 7 deletions README.md
@@ -90,9 +90,9 @@ Return to the `aws_terraform_bgg` directory and run `make setup_boardgamegeek`

- TEST LOCAL - `game_data_scraper.main.py` for GAME to test a single file locally
- Use to test a single specific url file. Must have generated game urls first with step 02.
- - Run locally and pass the scraper type `game` as an arg, and an existing filename without directory or suffix from `data/scraper_urls_raw_game`
+ - Run locally and pass the scraper type `game` as an arg, and an existing filename without directory or suffix from `data/prod/scraper_urls_raw_game`
- Example: `python game_data_scraper/main.py game group1_game_scraper_urls_raw`
- - Only saves data locally to `data/games/scraped_xml_raw`
+ - Only saves data locally to `data/prod/games/scraped_xml_raw`

- TEST ON AWS - `lambda_functions.dev_bgg_scraper_fargate_trigger` for GAME will trigger process to run and write scraping on S3
- Must have generated game urls first with step 02.
@@ -114,15 +114,15 @@ Return to the `aws_terraform_bgg` directory and run `make setup_boardgamegeek`

### 08 Clean raw scraped GAME data

- - `game_data_cleaner.main.py`
+ - `modules/game_data_cleaner.main.py`
- Takes the scraped files and composes into various dirty data frames of full data. Writes these locally. Will only write to S3 if run on AWS.
- Step 03 needs to have run at least once for this to work, although two sample files from local will also suffice for testing.
- - If files are present on S3, it will download all of them for this process. If there are no files on S3 yet, it will use files in `data/games/scraped_xml_raw`
+ - If files are present on S3, it will download all of them for this process. If there are no files on S3 yet, it will use files in `data/prod/games/scraped_xml_raw`

### 09 Generate USER scraping URLS

- `lambda_functions.generate_user_urls_lambda.py`
- - Must have `games.pkl` in directory `data/game_dfs_dirty` OR on S3 from prior step.
+ - Must have `games.pkl` in directory `data/prod/game_dfs_dirty` OR on S3 from prior step.
- Loads the `games.pkl` file generated by 04 and generates user ratings urls. Will attempt to load games.pkl locally, otherwise will retrieve it from S3.

### 10 Scrape users from URLS
@@ -134,9 +134,9 @@ Return to the `aws_terraform_bgg` directory and run `make setup_boardgamegeek`

- TEST - `game_data_scraper.main.py` for USER
- Use to test a single specific url file. Must have generated user urls first with step 05.
- - Run locally and pass both scraper type `user` as an arg, and an existing filename without directory or suffix from `data/scraper_urls_raw_user`
+ - Run locally and pass both scraper type `user` as an arg, and an existing filename without directory or suffix from `data/prod/scraper_urls_raw_user`
- Example: `python game_data_scraper/main.py user group1_user_scraper_urls_raw`
- - Only saves data locally to `data/users/scraped_xml_raw`
+ - Only saves data locally to `data/prod/users/scraped_xml_raw`

## I added some new stuff to my deployment. How do I update it?

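A note on the local-first behavior described in step 09: loading `games.pkl` from `data/prod/game_dfs_dirty` and falling back to S3 presumably follows a pattern like the sketch below (the helper name, bucket argument, and absence of error handling are assumptions; the key mirrors the updated README path):

```python
import os
import pickle

import boto3


def load_games_pkl(bucket: str, key: str = "data/prod/game_dfs_dirty/games.pkl"):
    """Local-first load of games.pkl with an S3 fallback (sketch only)."""
    if not os.path.exists(key):
        # Mirror the S3 key layout locally, then pull the object down.
        os.makedirs(os.path.dirname(key), exist_ok=True)
        boto3.client("s3").download_file(bucket, key, key)
    with open(key, "rb") as f:
        return pickle.load(f)
```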
16 changes: 8 additions & 8 deletions aws_dagster_bgg/assets/assets.py
@@ -25,9 +25,9 @@ def bgg_games_csv(
s3_scraper_bucket = configs["s3_scraper_bucket"]

original_timestamps = {
- f'data/{configs["boardgamegeek_csv_filename"]}': s3_resource.get_last_modified(
+ f'data/prod/{configs["boardgamegeek_csv_filename"]}': s3_resource.get_last_modified(
bucket=s3_scraper_bucket,
- key=f'data/{configs["boardgamegeek_csv_filename"]}',
+ key=f'data/prod/{configs["boardgamegeek_csv_filename"]}',
)
}

@@ -39,7 +39,7 @@ def bgg_games_csv(

return compare_timestamps_for_refresh(
original_timestamps=original_timestamps,
- file_list_to_check=[f'data/{configs["boardgamegeek_csv_filename"]}'],
+ file_list_to_check=[f'data/prod/{configs["boardgamegeek_csv_filename"]}'],
location_bucket=s3_scraper_bucket,
sleep_timer=15,
s3_resource=s3_resource,
@@ -117,7 +117,7 @@ def game_dfs_clean(
configs = config_resource.get_config_file()

bucket = configs["s3_scraper_bucket"]
- key = f'data/{configs["game"]["output_xml_directory"]}'
+ key = f'data/prod/{configs["game"]["output_xml_directory"]}'
data_sets = configs["game"]["data_sets"]

raw_game_files = s3_resource.list_file_keys(bucket=bucket, key=key)
Expand All @@ -129,7 +129,7 @@ def game_dfs_clean(
logger.info(data_sets)

data_set_file_names = [
f"data/{configs['game']['clean_dfs_directory']}/{x}_clean.pkl"
f"data/prod/{configs['game']['clean_dfs_directory']}/{x}_clean.pkl"
for x in data_sets
]
logger.info(data_set_file_names)
@@ -255,7 +255,7 @@ def create_new_urls(
lambda_function_name: str,
) -> bool:

- scraper_url_filenames = [f"data/{x}" for x in scraper_url_filenames]
+ scraper_url_filenames = [f"data/prod/{x}" for x in scraper_url_filenames]
logger.info(f"Created location urls for {scraper_url_filenames}")

original_timestamps = {
@@ -293,10 +293,10 @@ def scrape_data(
output_key_directory = configs[scraper_type]["output_xml_directory"]
output_key_suffix = configs[scraper_type]["output_raw_xml_suffix"]

- input_urls_key = f"data/{input_urls_key}"
+ input_urls_key = f"data/prod/{input_urls_key}"

scraper_raw_data_filenames = [
f"data/{output_key_directory}/{output_key_suffix.format(i)}"
f"data/prod/{output_key_directory}/{output_key_suffix.format(i)}"
for i in range(1, 31)
]

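Every hunk in this file makes the same mechanical substitution: S3 keys built as `f'data/{...}'` now carry the `data/prod/` prefix. For reference, the resulting key shapes look roughly like this (config values are illustrative; `boardgamegeek_csv_filename`, `clean_dfs_directory`, and `data_sets` appear in the config.json diff below, the rest are assumed):

```python
# Illustrative config values; only some of these appear in this PR's config.json.
configs = {
    "boardgamegeek_csv_filename": "boardgames_ranks.csv",
    "game": {
        "output_xml_directory": "games/scraped_xml_raw",  # assumed value
        "clean_dfs_directory": "games/game_dfs_clean",
        "data_sets": ["games", "designers"],
    },
}

csv_key = f'data/prod/{configs["boardgamegeek_csv_filename"]}'
# -> "data/prod/boardgames_ranks.csv"

xml_key = f'data/prod/{configs["game"]["output_xml_directory"]}'
# -> "data/prod/games/scraped_xml_raw"

clean_keys = [
    f"data/prod/{configs['game']['clean_dfs_directory']}/{x}_clean.pkl"
    for x in configs["game"]["data_sets"]
]
# -> ["data/prod/games/game_dfs_clean/games_clean.pkl", ...]
```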
12 changes: 12 additions & 0 deletions aws_terraform_bgg/lambdas_direct.tf
@@ -122,5 +122,17 @@ module "bgg_orchestrator_fargate_trigger" {
description = "Lambda function to trigger the boardgamegeek orchestrator fargate task"
}

module "dev_bgg_orchestrator_fargate_trigger" {
source = "./modules/lambda_function_direct"
function_name = "dev_bgg_orchestrator_fargate_trigger"
timeout = 600
memory_size = 128
role = module.bgg_orchestrator_fargate_trigger_role.arn
handler = "bgg_orchestrator_fargate_trigger.lambda_handler"
layers = ["arn:aws:lambda:${var.REGION}:336392948345:layer:AWSSDKPandas-Python312:13"]
environment = "dev"
description = "DEV Lambda function to trigger the boardgamegeek orchestrator fargate task"
}
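Once the dev module deploys, a minimal smoke test of the new trigger might look like this (a sketch assuming AWS credentials and region are already configured; payload contents are omitted):

```python
import json

import boto3

# Sketch: invoke the new dev trigger synchronously and inspect the response.
client = boto3.client("lambda")
response = client.invoke(FunctionName="dev_bgg_orchestrator_fargate_trigger")
print(response["StatusCode"])                             # 200 on a successful sync invoke
print(json.loads(response["Payload"].read() or b"null"))  # handler's return value, if any
```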



6 changes: 3 additions & 3 deletions config.json
@@ -7,8 +7,8 @@
"orchestrator_task_definition": "bgg_orchestrator",
"boardgamegeek_csv_filename": "boardgames_ranks.csv",
"file_retrieval_lambda": "bgg_boardgame_file_retrieval",
"dev_directory": "test_data/",
"prod_directory": "data/",
"dev_directory": "data/test/",
"prod_directory": "data/prod/",
"user": {
"scrapy_bot_name": "bggscraper_users",
"raw_urls_directory": "users/scraper_urls_raw",
@@ -26,7 +26,7 @@
"clean_dfs_directory": "games/game_dfs_clean",
"output_urls_json_suffix": "_game_scraper_urls_raw.json",
"output_raw_xml_suffix": "master_group{}_game_raw.xml",
"kaggle_games_file": "games/kaggle_data/games.csv",
"kaggle_games_file": "games/kaggle_data/prod/games.csv",
"data_sets": [
"games",
"designers",
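With this change the dev/prod split lives under a single `data/` root (`data/test/` vs `data/prod/`). A minimal sketch of how a module might resolve the active prefix from these keys (the `ENVIRONMENT` variable and helper name are assumptions, not confirmed from `config.py`):

```python
import json
import os


def get_data_directory(config_path: str = "config.json") -> str:
    """Resolve the saved-data prefix from config.json (sketch only).

    dev_directory/prod_directory are the keys changed in the hunk above;
    the ENVIRONMENT variable and this helper itself are hypothetical.
    """
    with open(config_path) as f:
        configs = json.load(f)
    env = os.environ.get("ENVIRONMENT", "prod")
    return configs["dev_directory"] if env == "dev" else configs["prod_directory"]


# e.g. get_data_directory() -> "data/prod/" in prod, "data/test/" in dev
```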
10 files renamed without changes.
53 changes: 0 additions & 53 deletions game_data_cleaner/find_config.json

This file was deleted.
