From 91174db6f2205bc94c11d628f763308f64d7dc42 Mon Sep 17 00:00:00 2001 From: threnjen Date: Tue, 15 Oct 2024 12:15:23 -0700 Subject: [PATCH] update terraform makefile to clean env file for AWS --- .../dev_deployment_ecs_scraper.yml} | 9 -- .../temp_off/dev_eployment_ecs_cleaner.yml | 41 +++++++++ .../lambda_deployments_dev.yml | 0 ...prod_deployment_ecs_bgg_file_retrieval.yml | 35 +++++++ .../temp_off/prod_deployment_ecs_cleaner.yml | 32 +++++++ .../temp_off/prod_deployment_ecs_scraper.yml | 35 +++++++ .github/workflows/prod_deployment.yml | 60 ------------ .../prod_deployment_ecs_orchestrator.yml | 36 ++++++++ Dockerfiles/Dockerfile.bgg_orchestrator | 8 +- aws_dagster_bgg/README.md | 0 aws_dagster_bgg/__init__.py | 6 +- aws_dagster_bgg/assets/assets.py | 91 +++++++++++-------- aws_dagster_bgg/bgg_orchestrate.py | 5 +- aws_dagster_bgg/dagster_runs/.gitkeep | 0 aws_dagster_bgg/resources/__init__.py | 22 +++-- aws_terraform_bgg/iam_policies.tf | 3 +- aws_terraform_bgg/lambdas_direct.tf | 4 +- aws_terraform_bgg/makefile | 6 +- .../modules/ecs_task_definition/ecs.tf | 4 - aws_terraform_bgg/scripts/clean_env.py | 3 + aws_terraform_bgg/vpc/main.tf | 2 +- dagster_cloud.yaml | 4 - .../bgg_orchestrator_fargate_trigger.py | 18 ++-- .../pyproject.toml => pyproject.toml | 2 +- utils/processing_functions.py | 7 +- 25 files changed, 287 insertions(+), 146 deletions(-) rename .github/{workflows/dev_deployment.yml => temp_off/dev_deployment_ecs_scraper.yml} (73%) create mode 100644 .github/temp_off/dev_eployment_ecs_cleaner.yml rename .github/{workflows => temp_off}/lambda_deployments_dev.yml (100%) create mode 100644 .github/temp_off/prod_deployment_ecs_bgg_file_retrieval.yml create mode 100644 .github/temp_off/prod_deployment_ecs_cleaner.yml create mode 100644 .github/temp_off/prod_deployment_ecs_scraper.yml delete mode 100644 .github/workflows/prod_deployment.yml create mode 100644 .github/workflows/prod_deployment_ecs_orchestrator.yml create mode 100644 aws_dagster_bgg/README.md create mode 100644 aws_dagster_bgg/dagster_runs/.gitkeep delete mode 100644 dagster_cloud.yaml rename aws_dagster_bgg/pyproject.toml => pyproject.toml (74%) diff --git a/.github/workflows/dev_deployment.yml b/.github/temp_off/dev_deployment_ecs_scraper.yml similarity index 73% rename from .github/workflows/dev_deployment.yml rename to .github/temp_off/dev_deployment_ecs_scraper.yml index 94a620e..55b4e1a 100644 --- a/.github/workflows/dev_deployment.yml +++ b/.github/temp_off/dev_deployment_ecs_scraper.yml @@ -38,12 +38,3 @@ jobs: run: | DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.game-data-scraper --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY . docker push $ECR_REGISTRY/$ECR_REPOSITORY - - - name: Build, tag, push image to AWS ECR boardgamegeek_cleaner - env: - ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} - ECR_REPOSITORY: boardgamegeek_cleaner_dev - run: | - DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.game-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY . 
- docker push $ECR_REGISTRY/$ECR_REPOSITORY - \ No newline at end of file diff --git a/.github/temp_off/dev_eployment_ecs_cleaner.yml b/.github/temp_off/dev_eployment_ecs_cleaner.yml new file mode 100644 index 0000000..3b5c486 --- /dev/null +++ b/.github/temp_off/dev_eployment_ecs_cleaner.yml @@ -0,0 +1,41 @@ +name: DEV deployment to boardgamegeek cleaner + +on: + push: + branches: + - main + pull_request: + branches: + - main + +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Check out code + uses: actions/checkout@v2 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_GITHUB_ROLE }} + aws-region: ${{ secrets.AWS_REGION }} + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + + - name: Build, tag, push image to AWS ECR boardgamegeek_cleaner + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + ECR_REPOSITORY: boardgamegeek_cleaner_dev + run: | + DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.game-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY . + docker push $ECR_REGISTRY/$ECR_REPOSITORY + \ No newline at end of file diff --git a/.github/workflows/lambda_deployments_dev.yml b/.github/temp_off/lambda_deployments_dev.yml similarity index 100% rename from .github/workflows/lambda_deployments_dev.yml rename to .github/temp_off/lambda_deployments_dev.yml diff --git a/.github/temp_off/prod_deployment_ecs_bgg_file_retrieval.yml b/.github/temp_off/prod_deployment_ecs_bgg_file_retrieval.yml new file mode 100644 index 0000000..9d52838 --- /dev/null +++ b/.github/temp_off/prod_deployment_ecs_bgg_file_retrieval.yml @@ -0,0 +1,35 @@ +name: PROD deployment to bgg file retrieval + +on: + push: + branches: + - main + +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Check out code + uses: actions/checkout@v2 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_GITHUB_ROLE }} + aws-region: ${{ secrets.AWS_REGION }} + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + + - name: Build, tag, push image to AWS ECR bgg_boardgame_file_retrieval + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + ECR_REPOSITORY: bgg_boardgame_file_retrieval + run: | + DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.bgg_boardgame_file_retrieval -t $ECR_REGISTRY/$ECR_REPOSITORY . 
+ docker push $ECR_REGISTRY/$ECR_REPOSITORY diff --git a/.github/temp_off/prod_deployment_ecs_cleaner.yml b/.github/temp_off/prod_deployment_ecs_cleaner.yml new file mode 100644 index 0000000..d045a28 --- /dev/null +++ b/.github/temp_off/prod_deployment_ecs_cleaner.yml @@ -0,0 +1,32 @@ +name: PROD deployment to boardgamegeek cleaner + +on: + push: + branches: + - main + +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Check out code + uses: actions/checkout@v2 + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_GITHUB_ROLE }} + aws-region: ${{ secrets.AWS_REGION }} + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + - name: Build, tag, push image to AWS ECR boardgamegeek_cleaner + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + ECR_REPOSITORY: boardgamegeek_cleaner + run: | + DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.game-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY . + docker push $ECR_REGISTRY/$ECR_REPOSITORY diff --git a/.github/temp_off/prod_deployment_ecs_scraper.yml b/.github/temp_off/prod_deployment_ecs_scraper.yml new file mode 100644 index 0000000..978e723 --- /dev/null +++ b/.github/temp_off/prod_deployment_ecs_scraper.yml @@ -0,0 +1,35 @@ +name: PROD deployment to boardgamegeek scraper + +on: + push: + branches: + - main + +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Check out code + uses: actions/checkout@v2 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_GITHUB_ROLE }} + aws-region: ${{ secrets.AWS_REGION }} + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + + - name: Build, tag, push image to AWS ECR boardgamegeek_scraper + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + ECR_REPOSITORY: boardgamegeek_scraper + run: | + DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.game-data-scraper --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY . 
+          docker push $ECR_REGISTRY/$ECR_REPOSITORY
diff --git a/.github/workflows/prod_deployment.yml b/.github/workflows/prod_deployment.yml
deleted file mode 100644
index 99366d4..0000000
--- a/.github/workflows/prod_deployment.yml
+++ /dev/null
@@ -1,60 +0,0 @@
-name: PROD deployment to boardgamegeek scraper
-
-on:
-  push:
-    branches:
-      - main
-
-permissions:
-  id-token: write # This is required for requesting the JWT
-  contents: read # This is required for actions/checkout
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check out code
-        uses: actions/checkout@v2
-
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: ${{ secrets.AWS_GITHUB_ROLE }}
-          aws-region: ${{ secrets.AWS_REGION }}
-
-      - name: Login to Amazon ECR
-        id: login-ecr
-        uses: aws-actions/amazon-ecr-login@v2
-
-      - name: Build, tag, push image to AWS ECR bgg_boardgame_file_retrieval
-        env:
-          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
-          ECR_REPOSITORY: bgg_boardgame_file_retrieval
-        run: |
-          DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.bgg_boardgame_file_retrieval -t $ECR_REGISTRY/$ECR_REPOSITORY .
-          docker push $ECR_REGISTRY/$ECR_REPOSITORY
-
-      - name: Build, tag, push image to AWS ECR boardgamegeek_scraper
-        env:
-          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
-          ECR_REPOSITORY: boardgamegeek_scraper
-        run: |
-          DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.game-data-scraper --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
-          docker push $ECR_REGISTRY/$ECR_REPOSITORY
-
-      - name: Build, tag, push image to AWS ECR boardgamegeek_cleaner
-        env:
-          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
-          ECR_REPOSITORY: boardgamegeek_cleaner
-        run: |
-          DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.game-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
-          docker push $ECR_REGISTRY/$ECR_REPOSITORY
-
-      - name: Build, tag, push image to AWS ECR boardgamegeek_cleaner
-        env:
-          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
-          ECR_REPOSITORY: bgg_orchestrator
-        run: |
-          DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.bgg_orchestrator --build-arg ENV="prod" --build-arg ASSET="all" -t $ECR_REGISTRY/$ECR_REPOSITORY .
-          docker push $ECR_REGISTRY/$ECR_REPOSITORY
-
\ No newline at end of file
diff --git a/.github/workflows/prod_deployment_ecs_orchestrator.yml b/.github/workflows/prod_deployment_ecs_orchestrator.yml
new file mode 100644
index 0000000..404899c
--- /dev/null
+++ b/.github/workflows/prod_deployment_ecs_orchestrator.yml
@@ -0,0 +1,36 @@
+name: PROD deployment to boardgamegeek orchestrator
+
+on:
+  push:
+    branches:
+      - main
+
+permissions:
+  id-token: write # This is required for requesting the JWT
+  contents: read # This is required for actions/checkout
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v2
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.AWS_GITHUB_ROLE }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Login to Amazon ECR
+        id: login-ecr
+        uses: aws-actions/amazon-ecr-login@v2
+
+      - name: Build, tag, push image to AWS ECR bgg_orchestrator
+        env:
+          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
+          ECR_REPOSITORY: bgg_orchestrator
+        run: |
+          DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.bgg_orchestrator --build-arg ENV="prod" --build-arg ASSET="all" -t $ECR_REGISTRY/$ECR_REPOSITORY .
+ docker push $ECR_REGISTRY/$ECR_REPOSITORY + \ No newline at end of file diff --git a/Dockerfiles/Dockerfile.bgg_orchestrator b/Dockerfiles/Dockerfile.bgg_orchestrator index b8e5d59..f846755 100644 --- a/Dockerfiles/Dockerfile.bgg_orchestrator +++ b/Dockerfiles/Dockerfile.bgg_orchestrator @@ -10,23 +10,23 @@ RUN apt-get update && apt-get install -y \ RUN pip3 install pipenv # Copy the source code into the container -# COPY data game_data_scraper/data COPY aws_dagster_bgg aws_dagster_bgg -COPY utils utils COPY aws_dagster_bgg/Pipfile* . +COPY utils utils COPY config.py . -COPY dagster_cloud.yaml . -COPY aws_dagster_bgg/pyproject.toml . +COPY pyproject.toml . # Install dependencies with pipenv RUN pipenv sync +# Set environment variables with ARG for build-time and ENV for runtime ARG ENV ENV ENV=$ENV ARG ASSET ENV ASSET=$ASSET +# Expose the port (if needed for the service) EXPOSE 3000 # Set the entry point and command diff --git a/aws_dagster_bgg/README.md b/aws_dagster_bgg/README.md new file mode 100644 index 0000000..e69de29 diff --git a/aws_dagster_bgg/__init__.py b/aws_dagster_bgg/__init__.py index a3884b4..7c0e964 100644 --- a/aws_dagster_bgg/__init__.py +++ b/aws_dagster_bgg/__init__.py @@ -29,9 +29,9 @@ "s3_resource": S3Resource( region_name=REGION, ), - "dynamodb_resource": DynamoDBResource( - region_name=REGION, table_name="boardgamegeek" - ), + # "dynamodb_resource": DynamoDBResource( + # region_name=REGION, table_name="boardgamegeek" + # ), "lambda_resource": LambdaHandlerResource(region_name=REGION), "ecs_resource": ECSResource(region_name=REGION), "config_resource": ConfigResource( diff --git a/aws_dagster_bgg/assets/assets.py b/aws_dagster_bgg/assets/assets.py index 175d207..feace1f 100644 --- a/aws_dagster_bgg/assets/assets.py +++ b/aws_dagster_bgg/assets/assets.py @@ -1,7 +1,10 @@ -from dagster import asset, ConfigurableResource, op +from dagster import asset, ConfigurableResource, op, get_dagster_logger import time import os from datetime import datetime +import logging + +logger = get_dagster_logger() @asset @@ -12,18 +15,23 @@ def bgg_games_csv( ) -> bool: f"""Triggers the lambda to get the games file from the BoardGameGeek website""" + logger.info("Getting the games csv file from BoardGameGeek") + configs = config_resource.get_config_file() s3_scraper_bucket = configs["s3_scraper_bucket"] - original_timestamps = get_original_timestamps( - s3_resource, - bucket=s3_scraper_bucket, - keys=[configs["boardgamegeek_csv_filename"]], - ) + original_timestamps = { + configs["boardgamegeek_csv_filename"]: s3_resource.get_last_modified( + bucket=s3_scraper_bucket, + key=configs["boardgamegeek_csv_filename"], + ) + } lambda_resource.invoke_lambda(function=configs["file_retrieval_lambda"]) + logger.info("Lambda invoked. 
Beginning timestamp checks...") + return compare_timestamps_for_refresh( original_timestamps=original_timestamps, file_list_to_check=[configs["boardgamegeek_csv_filename"]], @@ -51,6 +59,8 @@ def game_scraper_urls( Update the last modified timestamp of the keys in s3 """ + logger.info("Generating game scraper urls") + configs = config_resource.get_config_file() s3_scraper_bucket = configs["s3_scraper_bucket"] @@ -61,7 +71,7 @@ def game_scraper_urls( f"{raw_urls_directory}/group{i}{output_urls_json_suffix}" for i in range(1, 31) ] - return create_new_urls( + create_new_urls( lambda_resource, s3_resource, s3_scraper_bucket, @@ -69,6 +79,8 @@ def game_scraper_urls( lambda_function_name="bgg_generate_game_urls", ) + return True + @asset(deps=["game_scraper_urls"]) def scrape_game_data( @@ -118,7 +130,7 @@ def game_dfs_dirty( for key in data_set_file_names } - return compare_timestamps_for_refresh( + compare_timestamps_for_refresh( original_timestamps=original_timestamps, file_list_to_check=data_set_file_names, location_bucket=bucket, @@ -126,6 +138,8 @@ def game_dfs_dirty( s3_resource=s3_resource, ) + return True + @asset(deps=["game_dfs_dirty"]) def user_scraper_urls( @@ -155,7 +169,7 @@ def user_scraper_urls( f"{raw_urls_directory}/group{i}{output_urls_json_suffix}" for i in range(1, 31) ] - return create_new_urls( + create_new_urls( lambda_resource, s3_resource, s3_scraper_bucket, @@ -163,23 +177,7 @@ def user_scraper_urls( lambda_function_name="bgg_generate_user_urls", ) - -@op -def get_original_timestamps( - s3_resource: ConfigurableResource, - bucket: str, - keys: list[str], -) -> dict: - try: - return { - key: s3_resource.get_last_modified( - bucket=bucket, - key=key, - ) - for key in keys - } - except: - return {key: datetime(1970, 1, 1, 0, 0, 0, 0) for key in keys} + return True @op @@ -194,18 +192,25 @@ def compare_timestamps_for_refresh( time.sleep(sleep_timer) + logger.info("Checking timestamps...") + while len(file_list_to_check): + logger.info(f"Files to check: {file_list_to_check}") for key in file_list_to_check: + logger.info(f"Checking key: {key}") new_timestamp_tracker[key] = s3_resource.get_last_modified( bucket=location_bucket, key=key, ) + logger.info(f"Original timestamp: {original_timestamps[key]}") + logger.info(f"New timestamp: {new_timestamp_tracker[key]}") for key in original_timestamps: new_date = new_timestamp_tracker[key] old_date = original_timestamps[key] + logger.info(f"New date: {new_date}, Old date: {old_date}") if new_date > old_date: - print( + logger.info( f"new timestamp {new_date} is greater than old timestamp {old_date}" ) if key in file_list_to_check: @@ -225,15 +230,17 @@ def create_new_urls( lambda_function_name: str, ) -> bool: - original_timestamps = get_original_timestamps( - s3_resource=s3_resource, - bucket=s3_scraper_bucket, - keys=scraper_url_filenames, - ) + original_timestamps = { + key: s3_resource.get_last_modified( + bucket=s3_scraper_bucket, + key=key, + ) + for key in scraper_url_filenames + } lambda_resource.invoke_lambda(function=lambda_function_name) - return compare_timestamps_for_refresh( + compare_timestamps_for_refresh( original_timestamps=original_timestamps, file_list_to_check=scraper_url_filenames, location_bucket=s3_scraper_bucket, @@ -241,6 +248,8 @@ def create_new_urls( s3_resource=s3_resource, ) + return True + @op def scrape_data( @@ -259,11 +268,13 @@ def scrape_data( f"{output_key_directory}/{output_key_suffix.format(i)}" for i in range(1, 31) ] - original_timestamps = get_original_timestamps( - s3_resource=s3_resource, 
- bucket=bucket, - keys=scraper_raw_data_filenames, - ) + original_timestamps = { + key: s3_resource.get_last_modified( + bucket=bucket, + key=key, + ) + for key in scraper_raw_data_filenames + } game_scraper_url_filenames = s3_resource.list_file_keys( bucket=bucket, key=input_urls_key @@ -287,7 +298,7 @@ def scrape_data( } ecs_resource.launch_ecs_task(task_definition, overrides) - return compare_timestamps_for_refresh( + compare_timestamps_for_refresh( original_timestamps=original_timestamps, file_list_to_check=scraper_raw_data_filenames, location_bucket=bucket, @@ -295,6 +306,8 @@ def scrape_data( s3_resource=s3_resource, ) + return True + # @multi_asset(specs=[AssetSpec("asset1"), AssetSpec("asset2")]) # def materialize_1_and_2(): diff --git a/aws_dagster_bgg/bgg_orchestrate.py b/aws_dagster_bgg/bgg_orchestrate.py index e8847be..af050ac 100644 --- a/aws_dagster_bgg/bgg_orchestrate.py +++ b/aws_dagster_bgg/bgg_orchestrate.py @@ -1,6 +1,7 @@ import subprocess import time import sys +import os if __name__ == "__main__": @@ -16,12 +17,12 @@ if asset == "all": print("Executing all assets...") subprocess.run( - f"dagster job execute --package-name bgg_orchestrator -j bgg_job".split(" ") + f"dagster job execute --package-name aws_dagster_bgg -j bgg_job".split(" ") ) else: print(f"Executing asset: {asset}...") subprocess.run( - f"dagster asset materialize --select {asset} --package-name bgg_orchestrator".split( + f"dagster asset materialize --select {asset} --package-name aws_dagster_bgg".split( " " ) ) diff --git a/aws_dagster_bgg/dagster_runs/.gitkeep b/aws_dagster_bgg/dagster_runs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/aws_dagster_bgg/resources/__init__.py b/aws_dagster_bgg/resources/__init__.py index ed34523..be570af 100644 --- a/aws_dagster_bgg/resources/__init__.py +++ b/aws_dagster_bgg/resources/__init__.py @@ -1,9 +1,12 @@ -from dagster import EnvVar, ConfigurableResource +from dagster import EnvVar, ConfigurableResource, get_dagster_logger import boto3 import json from datetime import datetime import pytz import os +import logging + +logger = get_dagster_logger() REGION = os.environ.get("TF_VAR_REGION", "us-west-2") TERRAFORM_STATE_BUCKET = os.environ.get("TF_VAR_BUCKET") @@ -17,7 +20,7 @@ def get_dynamodb_client(self): return boto3.client("dynamodb", region_name=REGION) def get_last_modified(self, key): - print(f"Key: {key}") + logger.info(f"Key: {key}") return self.get_dynamodb_client().get_item( TableName=self.table_name, Key={ @@ -28,7 +31,7 @@ def get_last_modified(self, key): )["Item"]["last_modified"]["S"] def update_last_modified(self, key, timestamp): - print(f"Key: {key}, Timestamp: {timestamp}") + logger.info(f"Key: {key}, Timestamp: {timestamp}") self.get_dynamodb_client().put_item( TableName=self.table_name, Item={"filename": {"S": key}, "last_modified": {"S": timestamp}}, @@ -52,12 +55,13 @@ def get_s3_client(self): return boto3.client("s3", region_name=self.region_name) def get_last_modified(self, bucket: str, key): - print(f"Bucket: {bucket}, Key: {key}") + logger.info(f"Bucket: {bucket}, Key: {key}") try: return self.get_s3_client().get_object_attributes( Bucket=bucket, Key=key, ObjectAttributes=["ObjectParts"] )["LastModified"] - except: + except Exception as e: + logger.info(f"Error: {e}") return datetime(1970, 1, 1, 0, 0, 0, 0, pytz.UTC) def list_file_keys(self, bucket: str, key): @@ -67,7 +71,7 @@ def list_file_keys(self, bucket: str, key): return [x["Key"] for x in raw_files] def load_json(self, bucket: str, key): - print(f"Loading data 
from S3: {key}") + logger.info(f"Loading data from S3: {key}") object = ( self.get_s3_client() .get_object(Bucket=bucket, Key=key)["Body"] @@ -86,7 +90,7 @@ def get_config_file(self): try: return json.loads(open("config.json")) except: - print("No config file found") + logger.info("No config file found") configs = S3Resource(region_name=self.region_name).load_json( bucket=self.bucket, key="config.json" ) @@ -126,6 +130,10 @@ def launch_ecs_task(self, task_definition: str, overrides: dict = {}): terraform_state_file = self.get_terraform_state_file_for_vpc() + logger.info( + f"Got terraform state file. Launching ECS task for {task_definition}" + ) + self.get_ecs_client().run_task( taskDefinition=f"{task_definition}:{self.get_latest_task_revision(task_definition)}", cluster=ConfigResource()["ecs_task_components"]["cluster"], diff --git a/aws_terraform_bgg/iam_policies.tf b/aws_terraform_bgg/iam_policies.tf index d737571..295aa44 100644 --- a/aws_terraform_bgg/iam_policies.tf +++ b/aws_terraform_bgg/iam_policies.tf @@ -8,7 +8,8 @@ resource "aws_iam_policy" "S3_Access_boardgamegeek_scraper_policy" { Action = [ "s3:ListBucket", "s3:PutObject", - "s3:GetObject" + "s3:GetObject", + "s3:GetObjectAttributes" ] Effect = "Allow" Resource = [ diff --git a/aws_terraform_bgg/lambdas_direct.tf b/aws_terraform_bgg/lambdas_direct.tf index 331b4a4..1ace9c3 100644 --- a/aws_terraform_bgg/lambdas_direct.tf +++ b/aws_terraform_bgg/lambdas_direct.tf @@ -15,7 +15,7 @@ module "bgg_generate_game_urls" { source = "./modules/lambda_function_direct" function_name = "bgg_generate_game_urls" timeout = 900 - memory_size = 512 + memory_size = 1024 role = module.bgg_generate_game_urls_lambda_role.arn handler = "generate_game_urls_lambda.lambda_handler" layers = ["arn:aws:lambda:${var.REGION}:336392948345:layer:AWSSDKPandas-Python312:13"] @@ -26,7 +26,7 @@ module "bgg_generate_user_urls" { source = "./modules/lambda_function_direct" function_name = "bgg_generate_user_urls" timeout = 900 - memory_size = 512 + memory_size = 1024 role = module.bgg_generate_user_urls_lambda_role.arn handler = "generate_user_urls_lambda.lambda_handler" layers = ["arn:aws:lambda:${var.REGION}:336392948345:layer:AWSSDKPandas-Python312:13"] diff --git a/aws_terraform_bgg/makefile b/aws_terraform_bgg/makefile index c9f35f9..e206488 100644 --- a/aws_terraform_bgg/makefile +++ b/aws_terraform_bgg/makefile @@ -57,8 +57,11 @@ get_current_ip: get_terraform_bucket TF_VAR_MY_IP_FIRST_THREE_BLOCKS=$$response; \ echo "TF_VAR_MY_IP_FIRST_THREE_BLOCKS=$$TF_VAR_MY_IP_FIRST_THREE_BLOCKS" >> ../.env +enter_temp_vars: get_current_ip + echo "IS_LOCAL=False" >> ../.env + # Make the backend config file for terraform -backend_config: get_current_ip +backend_config: enter_temp_vars echo 'key="boardgamegeek.tfstate"' >> backend.conf # A target that runs the Python script and checks the output @@ -98,6 +101,7 @@ cleanup_superfluous_files: setup_boardgamegeek @echo "\nCleaning up garbage files" find . -type f -name "*.DS_Store" -delete find . -type f -name '*!lambda_function.zip' -delete + echo "PYTHONPATH=." >> ../.env failure-action: @echo "\nScript was not ready to run Terraform. Running cleanup and exiting, do not interrupt..." 
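The makefile targets above stage temporary values (the TF_VAR_* trio, IS_LOCAL=False, PYTHONPATH=.) into the repo-level .env for the duration of a Terraform run, and scripts/clean_env.py, patched below, strips them back out afterwards. Only that script's skip-list hunk is visible in this patch, so the following Python sketch of the cleanup pass is an assumption about the full file, not the script itself:

    # Hypothetical sketch of the cleanup scripts/clean_env.py performs.
    # Key names come from the diff; the surrounding file handling is assumed.
    TEMP_KEYS = (
        "TF_VAR_BUCKET",
        "TF_VAR_REGION",
        "TF_VAR_MY_IP_FIRST_THREE_BLOCKS",
        "PYTHONPATH",
        "IS_LOCAL",
        "ENV",
    )

    def clean_env(path: str = "../.env") -> None:
        with open(path) as f:
            lines = f.readlines()
        # Matching the key before "=" (rather than the substring test the
        # hunk shows) avoids dropping unrelated lines that merely contain
        # "ENV" somewhere in their name.
        kept = [
            line for line in lines
            if line.strip() and line.split("=", 1)[0] not in TEMP_KEYS
        ]
        with open(path, "w") as f:
            f.writelines(kept)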
diff --git a/aws_terraform_bgg/modules/ecs_task_definition/ecs.tf b/aws_terraform_bgg/modules/ecs_task_definition/ecs.tf index 1a2e2bb..3498050 100644 --- a/aws_terraform_bgg/modules/ecs_task_definition/ecs.tf +++ b/aws_terraform_bgg/modules/ecs_task_definition/ecs.tf @@ -7,10 +7,6 @@ resource "aws_ecs_task_definition" "task_definition" { image = var.image cpu = 0, portMappings = [ - { - containerPort = 80, - hostPort = 80 - }, { containerPort = 3000, hostPort = 3000 diff --git a/aws_terraform_bgg/scripts/clean_env.py b/aws_terraform_bgg/scripts/clean_env.py index 3bc32f2..3bed004 100644 --- a/aws_terraform_bgg/scripts/clean_env.py +++ b/aws_terraform_bgg/scripts/clean_env.py @@ -8,6 +8,9 @@ "TF_VAR_BUCKET" in line or "TF_VAR_REGION" in line or "TF_VAR_MY_IP_FIRST_THREE_BLOCKS" in line + or "PYTHONPATH" in line + or "IS_LOCAL" in line + or "ENV" in line ): continue if line == "\n": diff --git a/aws_terraform_bgg/vpc/main.tf b/aws_terraform_bgg/vpc/main.tf index 32c9426..58bb0b6 100644 --- a/aws_terraform_bgg/vpc/main.tf +++ b/aws_terraform_bgg/vpc/main.tf @@ -52,7 +52,7 @@ resource "aws_security_group" "ec2_dagster_port_access" { "${var.MY_IP_FIRST_THREE_BLOCKS}.0/24" ], "description": "", - "from_port": 22, + "from_port": 3000, "ipv6_cidr_blocks": [], "prefix_list_ids": [], "protocol": "tcp", diff --git a/dagster_cloud.yaml b/dagster_cloud.yaml deleted file mode 100644 index d5f7668..0000000 --- a/dagster_cloud.yaml +++ /dev/null @@ -1,4 +0,0 @@ -locations: - - location_name: boardgamegeek - code_source: - package_name: aws_dagster_bgg \ No newline at end of file diff --git a/lambda_functions/bgg_orchestrator_fargate_trigger.py b/lambda_functions/bgg_orchestrator_fargate_trigger.py index fcc6d6a..0c59fa3 100644 --- a/lambda_functions/bgg_orchestrator_fargate_trigger.py +++ b/lambda_functions/bgg_orchestrator_fargate_trigger.py @@ -46,6 +46,15 @@ def lambda_handler(event, context): .get("revision") ) + subnets = terraform_state_file["outputs"]["public_subnets"]["value"] + print(subnets) + + security_groups = [ + terraform_state_file["outputs"]["sg_ec2_ssh_access"]["value"], + terraform_state_file["outputs"]["sg_ec2_dagster_port_access"]["value"], + ] + print(security_groups) + response = ecs_client.run_task( taskDefinition=f"{task_definition}:{latest_version}", cluster="boardgamegeek", @@ -55,13 +64,8 @@ def lambda_handler(event, context): enableECSManagedTags=False, networkConfiguration={ "awsvpcConfiguration": { - "subnets": terraform_state_file["outputs"]["public_subnets"]["value"], - "securityGroups": [ - terraform_state_file["outputs"]["sg_ec2_ssh_access"]["value"], - terraform_state_file["outputs"]["sg_ec2_dagster_port_access"][ - "value" - ], - ], + "subnets": subnets, + "securityGroups": security_groups, "assignPublicIp": "ENABLED", }, }, diff --git a/aws_dagster_bgg/pyproject.toml b/pyproject.toml similarity index 74% rename from aws_dagster_bgg/pyproject.toml rename to pyproject.toml index 85772aa..34b27c5 100644 --- a/aws_dagster_bgg/pyproject.toml +++ b/pyproject.toml @@ -3,4 +3,4 @@ requires = ["setuptools"] build-backend = "setuptools.build_meta" [tool.dagster] -module_name = "bgg_orchestrator" +module_name = "aws_dagster_bgg" diff --git a/utils/processing_functions.py b/utils/processing_functions.py index f7a7cd5..a0537dd 100644 --- a/utils/processing_functions.py +++ b/utils/processing_functions.py @@ -29,10 +29,13 @@ def save_file_local_first(path: str, file_name: str, data: Union[pd.DataFrame, dict]): file_path = f"{path}/{file_name}" + print(file_path) if IS_LOCAL: + 
print(f"Saving {file_name} to local") LocalFileHandler().save_file(file_path=file_path, data=data) if ENV == "prod": + print(f"Saving {file_name} to S3") S3FileHandler().save_file(file_path=file_path, data=data) @@ -45,7 +48,9 @@ def load_file_local_first(path: str, file_name: str): except FileNotFoundError as e: print(f"Downloading {file_name} from S3") file = S3FileHandler().load_file(file_path=file_path) - LocalFileHandler().save_file(file_path=file_path, data=file) + if IS_LOCAL: + print(f"Saving {file_name} to local") + LocalFileHandler().save_file(file_path=file_path, data=file) return file
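Taken together, the asset changes in this patch settle on a single idiom: record each expected S3 output's LastModified stamp inline, trigger the Lambda or ECS work, poll until every key has been rewritten, then return a plain True so downstream assets materialize even though the comparison result itself is discarded. A condensed sketch of that loop, with the resource methods duck-typed from the diff (get_last_modified, invoke_lambda) and the polling cadence assumed rather than taken from the source:

    import time

    def wait_for_refresh(s3_resource, lambda_resource, bucket: str,
                         keys: list[str], function: str,
                         sleep_timer: int = 300) -> bool:
        # Snapshot LastModified for every expected output before triggering
        # work; S3Resource.get_last_modified falls back to the epoch when a
        # key does not exist yet, so first-time writes also register as newer.
        original = {
            key: s3_resource.get_last_modified(bucket=bucket, key=key)
            for key in keys
        }
        lambda_resource.invoke_lambda(function=function)
        pending = list(keys)
        while pending:
            time.sleep(sleep_timer)  # assumed cadence; not shown in the hunks
            for key in list(pending):
                new = s3_resource.get_last_modified(bucket=bucket, key=key)
                if new > original[key]:
                    pending.remove(key)
        return True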