Skip to content

Commit

Permalink
Merge pull request #68 from threnjen/rename_user_data_cleaner_to_rati…
Browse files Browse the repository at this point in the history
…ngs_cleaner

Rename user data cleaner to ratings cleaner
  • Loading branch information
threnjen authored Nov 26, 2024
2 parents 39e7239 + c731ee1 commit 1755b63
Show file tree
Hide file tree
Showing 39 changed files with 672 additions and 663 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy_file_to_s3.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: PROD deployment to boardgamegeek user data cleaner
name: PROD deployment to boardgamegeek ratings data cleaner

on:
push:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: DEV deployment to boardgamegeek user data cleaner
name: DEV deployment to boardgamegeek ratings data cleaner

on:
push:
Expand Down Expand Up @@ -31,10 +31,10 @@ jobs:
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2

- name: Build, tag, push image to AWS ECR dev_bgg_user_data_cleaner
- name: Build, tag, push image to AWS ECR dev_bgg_ratings_data_cleaner
env:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
ECR_REPOSITORY: dev_bgg_user_data_cleaner
ECR_REPOSITORY: dev_bgg_ratings_data_cleaner
run: |
DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.user-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
docker push $ECR_REGISTRY/$ECR_REPOSITORY
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/lambda_deployments_dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ jobs:
zip -r bgg_scraper_fargate_trigger.zip bgg_scraper_fargate_trigger.py ../../config.py ../../utils
zip -r bgg_game_data_cleaner_fargate_trigger.zip bgg_game_data_cleaner_fargate_trigger.py ../../config.py ../../utils
zip -r generate_game_urls_lambda.zip generate_game_urls_lambda.py ../../config.py ../../utils
zip -r generate_user_urls_lambda.zip generate_user_urls_lambda.py ../../config.py ../../utils
zip -r generate_ratings_urls_lambda.zip generate_ratings_urls_lambda.py ../../config.py ../../utils
zip -r bgg_orchestrator_fargate_trigger.zip bgg_orchestrator_fargate_trigger.py ../../config.py ../../utils
zip -r bgg_user_data_cleaner_fargate_trigger.zip bgg_user_data_cleaner_fargate_trigger.py ../../config.py ../../utils
zip -r bgg_ratings_data_cleaner_fargate_trigger.zip bgg_ratings_data_cleaner_fargate_trigger.py ../../config.py ../../utils
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
Expand All @@ -40,7 +40,7 @@ jobs:
aws lambda update-function-code --function-name=dev_bgg_scraper_fargate_trigger --zip-file=fileb://bgg_scraper_fargate_trigger.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=dev_bgg_game_data_cleaner_fargate_trigger --zip-file=fileb://bgg_game_data_cleaner_fargate_trigger.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=dev_bgg_generate_game_urls --zip-file=fileb://generate_game_urls_lambda.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=dev_bgg_generate_user_urls --zip-file=fileb://generate_user_urls_lambda.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=dev_bgg_generate_ratings_urls --zip-file=fileb://generate_ratings_urls_lambda.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=dev_bgg_orchestrator_fargate_trigger --zip-file=fileb://bgg_orchestrator_fargate_trigger.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=dev_bgg_user_data_cleaner_fargate_trigger --zip-file=fileb://bgg_user_data_cleaner_fargate_trigger.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=dev_bgg_ratings_data_cleaner_fargate_trigger --zip-file=fileb://bgg_ratings_data_cleaner_fargate_trigger.zip > /dev/null 2>&1
8 changes: 4 additions & 4 deletions .github/workflows/lambda_deployments_prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ jobs:
zip -r bgg_scraper_fargate_trigger.zip bgg_scraper_fargate_trigger.py ../../config.py ../../utils
zip -r bgg_game_data_cleaner_fargate_trigger.zip bgg_game_data_cleaner_fargate_trigger.py ../../config.py ../../utils
zip -r generate_game_urls_lambda.zip generate_game_urls_lambda.py ../../config.py ../../utils
zip -r generate_user_urls_lambda.zip generate_user_urls_lambda.py ../../config.py ../../utils
zip -r generate_ratings_urls_lambda.zip generate_ratings_urls_lambda.py ../../config.py ../../utils
zip -r bgg_orchestrator_fargate_trigger.zip bgg_orchestrator_fargate_trigger.py ../../config.py ../../utils
zip -r bgg_user_data_cleaner_fargate_trigger.zip bgg_user_data_cleaner_fargate_trigger.py ../../config.py ../../utils
zip -r bgg_ratings_data_cleaner_fargate_trigger.zip bgg_ratings_data_cleaner_fargate_trigger.py ../../config.py ../../utils
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v4
with:
Expand All @@ -35,6 +35,6 @@ jobs:
aws lambda update-function-code --function-name=bgg_scraper_fargate_trigger --zip-file=fileb://bgg_scraper_fargate_trigger.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=bgg_game_data_cleaner_fargate_trigger --zip-file=fileb://bgg_game_data_cleaner_fargate_trigger.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=bgg_generate_game_urls --zip-file=fileb://generate_game_urls_lambda.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=bgg_generate_user_urls --zip-file=fileb://generate_user_urls_lambda.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=bgg_generate_ratings_urls --zip-file=fileb://generate_ratings_urls_lambda.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=bgg_orchestrator_fargate_trigger --zip-file=fileb://bgg_orchestrator_fargate_trigger.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=bgg_user_data_cleaner_fargate_trigger --zip-file=fileb://bgg_user_data_cleaner_fargate_trigger.zip > /dev/null 2>&1
aws lambda update-function-code --function-name=bgg_ratings_data_cleaner_fargate_trigger --zip-file=fileb://bgg_ratings_data_cleaner_fargate_trigger.zip > /dev/null 2>&1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: PROD deployment to boardgamegeek user data cleaner
name: PROD deployment to boardgamegeek ratings data cleaner

on:
push:
Expand All @@ -23,10 +23,10 @@ jobs:
- name: Login to Amazon ECR
id: login-ecr
uses: aws-actions/amazon-ecr-login@v2
- name: Build, tag, push image to AWS ECR bgg_user_data_cleaner
- name: Build, tag, push image to AWS ECR bgg_ratings_data_cleaner
env:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
ECR_REPOSITORY: bgg_user_data_cleaner
ECR_REPOSITORY: bgg_ratings_data_cleaner
run: |
DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.user-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
docker push $ECR_REGISTRY/$ECR_REPOSITORY
10 changes: 5 additions & 5 deletions Dockerfiles/Dockerfile.user-data-cleaner
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,17 @@ RUN apt-get update && apt-get install -y \
RUN pip3 install pipenv

# Create necessary directories
RUN mkdir -p data/prod/users/user_dfs_clean \
data/test/users/user_dfs_clean \
RUN mkdir -p data/prod/users/ratings_dfs_clean \
data/test/users/ratings_dfs_clean \
modules

# Copy the source code into the container
COPY modules/user_data_cleaner modules/user_data_cleaner
COPY modules/ratings_data_cleaner modules/ratings_data_cleaner
COPY utils utils
COPY modules/user_data_cleaner/Pipfile* .
COPY modules/ratings_data_cleaner/Pipfile* .
COPY config.py .

# Install dependencies with pipenv
RUN pipenv sync

ENTRYPOINT ["pipenv", "run", "python", "modules/user_data_cleaner/main.py"]
ENTRYPOINT ["pipenv", "run", "python", "modules/ratings_data_cleaner/main.py"]
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ Return to the `aws_terraform_bgg` directory and run `make setup_boardgamegeek`
- On AWS, navigate to lambda
- From lambda, select `dev_bgg_scraper_fargate_trigger`
- To manually run, go to the "Test" tab
- In the "Event JSON" section, replace the existing keys with `"scraper_type": "game"`. It is recommended to enter in an event name and save the json for future.
- In the "Event JSON" section, replace the existing keys with `"scraper_type": "games"`. It is recommended to enter in an event name and save the json for future.
- Click "Test" to run.

- PROD - `lambda_functions.bgg_scraper_fargate_trigger` for GAME will trigger process to run and write scraping on S3
Expand All @@ -109,7 +109,7 @@ Return to the `aws_terraform_bgg` directory and run `make setup_boardgamegeek`
- On AWS, navigate to lambda
- From lambda, select `bgg_scraper_fargate_trigger`
- To manually run, go to the "Test" tab
- In the "Event JSON" section, replace the existing keys with "scraper_type": "game"`. It is recommended to enter in an event name and save the json for future.
- In the "Event JSON" section, replace the existing keys with `"scraper_type": "games"`. It is recommended to enter in an event name and save the json for future.
- Click "Test" to run.

### 08 Clean raw scraped GAME data
Expand All @@ -121,21 +121,21 @@ Return to the `aws_terraform_bgg` directory and run `make setup_boardgamegeek`

### 09 Generate USER scraping URLS

- `lambda_functions.generate_user_urls_lambda.py`
- `lambda_functions.generate_ratings_urls_lambda.py`
- Must have `games.pkl` in directory `data/prod/game_dfs_dirty` OR on S3 from prior step.
- Loads the `games.pkl` file generated by 04 and generates user ratings urls. Will attempt to load games.pkl locally, otherwise will retrieve it from S3.
- Loads the `games.pkl` file generated by 04 and generates ratings urls. Will attempt to load games.pkl locally, otherwise will retrieve it from S3.

### 10 Scrape users from URLS

- PROD - `lambda_functions.bgg_scraper_fargate_trigger` for USER will trigger process to run and write scraping on S3
- Must have generated game urls first with step 5.
- Scrapes the URLs generated by step #5. This script will always trigger tasks on AWS. DO NOT RUN WITHOUT INTENT costs over $15 per run.
- Must run with arg for scraper type "user" example `python lambda_functions.bgg_scraper_fargate_trigger.py user`
- Must run with arg for scraper type "ratings" example `python lambda_functions.bgg_scraper_fargate_trigger.py ratings`

- TEST - `bgg_scraper.main.py` for USER
- Use to test a single specific url file. Must have generated user urls first with step 05.
- Use to test a single specific url file. Must have generated ratings urls first with step 05.
- Run locally and pass both scraper type `ratings` as an arg, and an existing filename without directory or suffix from `data/prod/scraper_urls_raw_user`
- Example: `python bgg_scraper/main.py user group1_user_scraper_urls_raw`
- Example: `python bgg_scraper/main.py ratings group1_ratings_scraper_urls_raw`
- Only saves data locally to `data/prod/users/scraped_xml_raw`

## I added some new stuff to my deployment. How do I update it?
Expand Down
50 changes: 25 additions & 25 deletions aws_dagster_bgg/assets/assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ def game_scraper_urls(
configs = config_resource.get_config_file()

s3_scraper_bucket = configs["s3_scraper_bucket"]
raw_urls_directory = configs["game"]["raw_urls_directory"]
output_urls_json_suffix = configs["game"]["output_urls_json_suffix"]
raw_urls_directory = configs["games"]["raw_urls_directory"]
output_urls_json_suffix = configs["games"]["output_urls_json_suffix"]

game_scraper_url_filenames = (
[
Expand Down Expand Up @@ -107,7 +107,7 @@ def scraped_game_xmls(

configs = config_resource.get_config_file()

scrape_data(ecs_resource, s3_resource, configs, scraper_type="game")
scrape_data(ecs_resource, s3_resource, configs, scraper_type="games")

return True

Expand All @@ -125,8 +125,8 @@ def game_dfs_clean(
configs = config_resource.get_config_file()

bucket = configs["s3_scraper_bucket"]
key = f'{WORKING_ENV_DIR}{configs["game"]["output_xml_directory"]}'
data_sets = configs["game"]["data_sets"]
key = f'{WORKING_ENV_DIR}{configs["games"]["output_xml_directory"]}'
data_sets = configs["games"]["data_sets"]

raw_game_files = s3_resource.list_file_keys(bucket=bucket, key=key)

Expand Down Expand Up @@ -168,13 +168,13 @@ def game_dfs_clean(


@asset(deps=["game_dfs_clean"])
def user_scraper_urls(
def ratings_scraper_urls(
lambda_resource: ConfigurableResource,
s3_resource: ConfigurableResource,
config_resource: ConfigurableResource,
) -> bool:
"""
Generates the user scraper keys that should exist.
Generates the ratings scraper keys that should exist.
Gets the last modified timestamp of each keys from s3.
Runs the lambda function to generate the urls.
Waits for the urls to be generated.
Expand All @@ -188,10 +188,10 @@ def user_scraper_urls(
configs = config_resource.get_config_file()

s3_scraper_bucket = configs["s3_scraper_bucket"]
raw_urls_directory = configs["user"]["raw_urls_directory"]
output_urls_json_suffix = configs["user"]["output_urls_json_suffix"]
raw_urls_directory = configs["ratings"]["raw_urls_directory"]
output_urls_json_suffix = configs["ratings"]["output_urls_json_suffix"]

user_scraper_url_filenames = (
ratings_scraper_url_filenames = (
[
f"{raw_urls_directory}/group{i}{output_urls_json_suffix}"
for i in range(1, 31)
Expand All @@ -204,59 +204,59 @@ def user_scraper_urls(
lambda_resource,
s3_resource,
s3_scraper_bucket,
user_scraper_url_filenames,
lambda_function_name="bgg_generate_user_urls",
ratings_scraper_url_filenames,
lambda_function_name="bgg_generate_ratings_urls",
)

return True


@asset(deps=["user_scraper_urls"])
def scraped_user_xmls(
@asset(deps=["ratings_scraper_urls"])
def scraped_ratings_xmls(
ecs_resource: ConfigurableResource,
s3_resource: ConfigurableResource,
config_resource: ConfigurableResource,
) -> bool:
"""
Scrapes the BGG website for user data, using the URLs generated in the previous step
Scrapes the BGG website for ratings data, using the URLs generated in the previous step
"""

configs = config_resource.get_config_file()

scrape_data(ecs_resource, s3_resource, configs, scraper_type="user")
scrape_data(ecs_resource, s3_resource, configs, scraper_type="ratings")

return True


@asset(deps=["scraped_user_xmls"])
def user_data_df(
@asset(deps=["scraped_ratings_xmls"])
def ratings_data_df(
s3_resource: ConfigurableResource,
ecs_resource: ConfigurableResource,
config_resource: ConfigurableResource,
) -> bool:
"""
Creates a clean dataframe for the user data from the scraped user XML files
Creates a clean dataframe for the ratings data from the scraped ratings XML files
"""

configs = config_resource.get_config_file()

bucket = configs["s3_scraper_bucket"]
key = f'{WORKING_ENV_DIR}{configs["user"]["output_xml_directory"]}'
key = f'{WORKING_ENV_DIR}{configs["ratings"]["output_xml_directory"]}'

raw_user_files = s3_resource.list_file_keys(bucket=bucket, key=key)
raw_ratings_files = s3_resource.list_file_keys(bucket=bucket, key=key)

assert len(raw_user_files) == 30 if ENVIRONMENT == "prod" else 1
assert len(raw_ratings_files) == 30 if ENVIRONMENT == "prod" else 1

task_definition = (
"bgg_user_data_cleaner"
"bgg_ratings_data_cleaner"
if ENVIRONMENT == "prod"
else "dev_bgg_user_data_cleaner"
else "dev_bgg_ratings_data_cleaner"
)

ecs_resource.launch_ecs_task(task_definition=task_definition)

check_filenames = [
f"{WORKING_ENV_DIR}{configs['user']['clean_dfs_directory']}/user_data.pkl"
f"{WORKING_ENV_DIR}{configs['ratings']['clean_dfs_directory']}/ratings_data.pkl"
]
logger.info(check_filenames)

Expand Down
12 changes: 6 additions & 6 deletions aws_terraform_bgg/elastic_container_registry.tf
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ locals {
module.bgg_scraper_ecr.ecr_repository_name,
module.dev_bgg_scraper_ecr.ecr_repository_name,
module.dev_bgg_orchestrator_ecr.ecr_repository_name,
module.bgg_user_data_cleaner_ecr.ecr_repository_name,
module.dev_bgg_user_data_cleaner_ecr.ecr_repository_name
module.bgg_ratings_data_cleaner_ecr.ecr_repository_name,
module.dev_bgg_ratings_data_cleaner_ecr.ecr_repository_name
]
}

Expand Down Expand Up @@ -40,14 +40,14 @@ module "dev_bgg_game_data_cleaner_ecr" {
ecr_repository_name = "dev_bgg_game_data_cleaner"
}

module "bgg_user_data_cleaner_ecr" {
module "bgg_ratings_data_cleaner_ecr" {
source = "./modules/ecr"
ecr_repository_name = "bgg_user_data_cleaner"
ecr_repository_name = "bgg_ratings_data_cleaner"
}

module "dev_bgg_user_data_cleaner_ecr" {
module "dev_bgg_ratings_data_cleaner_ecr" {
source = "./modules/ecr"
ecr_repository_name = "dev_bgg_user_data_cleaner"
ecr_repository_name = "dev_bgg_ratings_data_cleaner"
}

module "bgg_scraper_ecr" {
Expand Down
24 changes: 12 additions & 12 deletions aws_terraform_bgg/iam_ecs_role.tf
Original file line number Diff line number Diff line change
Expand Up @@ -59,33 +59,33 @@ resource "aws_iam_role_policy_attachment" "glue_boardgamegeekbgg_game_data_clean
policy_arn = aws_iam_policy.glue_table_access.arn
}

module "bgg_user_data_cleaner_FargateTaskRole_role" {
module "bgg_ratings_data_cleaner_FargateTaskRole_role" {
source = "./modules/iam_ecs_roles"
task_definition = "bgg_user_data_cleaner_FargateTaskRole"
task_definition = "bgg_ratings_data_cleaner_FargateTaskRole"
}

module "bgg_user_data_cleaner_FargateExecutionRole_role" {
module "bgg_ratings_data_cleaner_FargateExecutionRole_role" {
source = "./modules/iam_ecs_roles"
task_definition = "bgg_user_data_cleaner_FargateExecutionRole"
task_definition = "bgg_ratings_data_cleaner_FargateExecutionRole"
}

resource "aws_iam_role_policy_attachment" "S3_Access_bgg_user_data_cleaner_FargateExecutionRole_attach" {
role = module.bgg_user_data_cleaner_FargateExecutionRole_role.name
resource "aws_iam_role_policy_attachment" "S3_Access_bgg_ratings_data_cleaner_FargateExecutionRole_attach" {
role = module.bgg_ratings_data_cleaner_FargateExecutionRole_role.name
policy_arn = aws_iam_policy.S3_Access_bgg_scraper_policy.arn
}

resource "aws_iam_role_policy_attachment" "S3_Access_boardgamegeekbgg_user_data_cleaner_FargateTaskRoleattach" {
role = module.bgg_user_data_cleaner_FargateTaskRole_role.name
resource "aws_iam_role_policy_attachment" "S3_Access_boardgamegeekbgg_ratings_data_cleaner_FargateTaskRoleattach" {
role = module.bgg_ratings_data_cleaner_FargateTaskRole_role.name
policy_arn = aws_iam_policy.S3_Access_bgg_scraper_policy.arn
}

resource "aws_iam_role_policy_attachment" "Cloudwatch_Put_Metricsbgg_user_data_cleaner_FargateTaskRoleattach" {
role = module.bgg_user_data_cleaner_FargateTaskRole_role.name
resource "aws_iam_role_policy_attachment" "Cloudwatch_Put_Metricsbgg_ratings_data_cleaner_FargateTaskRoleattach" {
role = module.bgg_ratings_data_cleaner_FargateTaskRole_role.name
policy_arn = aws_iam_policy.Cloudwatch_Put_Metrics_policy.arn
}

resource "aws_iam_role_policy_attachment" "glue_boardgamegeekbgg_user_data_cleaner_FargateTaskRoleattach" {
role = module.bgg_user_data_cleaner_FargateTaskRole_role.name
resource "aws_iam_role_policy_attachment" "glue_boardgamegeekbgg_ratings_data_cleaner_FargateTaskRoleattach" {
role = module.bgg_ratings_data_cleaner_FargateTaskRole_role.name
policy_arn = aws_iam_policy.glue_table_access.arn
}

Expand Down
Loading

0 comments on commit 1755b63

Please sign in to comment.