touchup and cleanup
threnjen committed Nov 27, 2024
1 parent 9ffc64e commit a85dda8
Showing 13 changed files with 112 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy_file_to_s3.yml
@@ -1,4 +1,4 @@
-name: PROD deployment to boardgamegeek ratings data cleaner
+name: Deploy config file to S3

on:
push:
6 changes: 3 additions & 3 deletions Dockerfiles/Dockerfile.ratings-data-cleaner
@@ -13,13 +13,13 @@ RUN pip3 install pipenv
RUN mkdir -p modules

# Copy the source code into the container
-COPY modules/user_data_cleaner modules/user_data_cleaner
+COPY modules/ratings_data_cleaner modules/ratings_data_cleaner
COPY utils utils
COPY data data
-COPY modules/user_data_cleaner/Pipfile* .
+COPY modules/ratings_data_cleaner/Pipfile* .
COPY config.py .

# Install dependencies with pipenv
RUN pipenv sync

-ENTRYPOINT ["pipenv", "run", "python", "modules/user_data_cleaner/main.py"]
+ENTRYPOINT ["pipenv", "run", "python", "modules/ratings_data_cleaner/main.py"]
6 changes: 3 additions & 3 deletions Dockerfiles/Dockerfile.users-data-cleaner
@@ -13,13 +13,13 @@ RUN pip3 install pipenv
RUN mkdir -p modules

# Copy the source code into the container
-COPY modules/ratings_data_cleaner modules/ratings_data_cleaner
+COPY modules/users_data_cleaner modules/users_data_cleaner
COPY utils utils
COPY data data
-COPY modules/ratings_data_cleaner/Pipfile* .
+COPY modules/users_data_cleaner/Pipfile* .
COPY config.py .

# Install dependencies with pipenv
RUN pipenv sync

-ENTRYPOINT ["pipenv", "run", "python", "modules/ratings_data_cleaner/main.py"]
+ENTRYPOINT ["pipenv", "run", "python", "modules/users_data_cleaner/main.py"]
4 changes: 2 additions & 2 deletions README.md
@@ -91,7 +91,7 @@ Return to the `aws_terraform_bgg` directory and run `make setup_boardgamegeek`
- TEST LOCAL - `bgg_scraper.main.py` for GAME to test a single file locally
- Use to test a single specific url file. Must have generated game urls first with step 02.
- Run locally and pass the scraper type `game` as an arg, and an existing filename without directory or suffix from `data/prod/scraper_urls_raw_game`
-- Example: `python bgg_scraper/main.py game group1_game_scraper_urls_raw`
+- Example: `python bgg_scraper/main.py game group1_games_scraper_urls_raw_raw`
- Only saves data locally to `data/prod/games/scraped_xml_raw`

- TEST ON AWS - `lambda_functions.dev_bgg_scraper_fargate_trigger` for GAME will trigger process to run and write scraping on S3
@@ -135,7 +135,7 @@ Return to the `aws_terraform_bgg` directory and run `make setup_boardgamegeek`
- TEST - `bgg_scraper.main.py` for USER
- Use to test a single specific url file. Must have generated ratings urls first with step 05.
- Run locally and pass both scraper type `user` as an arg, and an existing filename without directory or suffix from `data/prod/scraper_urls_raw_user`
-- Example: `python bgg_scraper/main.py ratings group1_ratings_scraper_urls_raw`
+- Example: `python bgg_scraper/main.py ratings group1_ratings_scraper_urls_raw_raw`
- Only saves data locally to `data/prod/users/scraped_xml_raw`

## I added some new stuff to my deployment. How do I update it?
81 changes: 66 additions & 15 deletions aws_dagster_bgg/assets/assets.py
@@ -10,7 +10,7 @@


@asset
-def bgg_games_csv(
+def boardgame_ranks_csv(
s3_resource: ConfigurableResource,
lambda_resource: ConfigurableResource,
config_resource: ConfigurableResource,
@@ -49,8 +49,8 @@ def bgg_games_csv(
)


@asset(deps=["bgg_games_csv"])
def game_scraper_urls(
@asset(deps=["boardgame_ranks_csv"])
def games_scraper_urls_raw(
lambda_resource: ConfigurableResource,
s3_resource: ConfigurableResource,
config_resource: ConfigurableResource,
@@ -95,8 +95,8 @@ def game_scraper_urls(
return True


@asset(deps=["game_scraper_urls"])
def scraped_game_xmls(
@asset(deps=["games_scraper_urls_raw"])
def games_scraped_xml_raw(
ecs_resource: ConfigurableResource,
s3_resource: ConfigurableResource,
config_resource: ConfigurableResource,
@@ -112,7 +112,7 @@ def scraped_game_xmls(
return True


@asset(deps=["scraped_game_xmls"])
@asset(deps=["games_scraped_xml_raw"])
def game_dfs_clean(
s3_resource: ConfigurableResource,
ecs_resource: ConfigurableResource,
@@ -143,7 +143,7 @@ def game_dfs_clean(
logger.info(data_sets)

data_set_file_names = [
f"{WORKING_ENV_DIR}{configs['game']['clean_dfs_directory']}/{x}_clean.pkl"
f"{WORKING_ENV_DIR}{configs['games']['clean_dfs_directory']}/{x}_clean.pkl"
for x in data_sets
]
logger.info(data_set_file_names)
@@ -168,7 +168,7 @@ def game_dfs_clean(


@asset(deps=["game_dfs_clean"])
-def ratings_scraper_urls(
+def ratings_scraper_urls_raw(
lambda_resource: ConfigurableResource,
s3_resource: ConfigurableResource,
config_resource: ConfigurableResource,
@@ -211,8 +211,8 @@ def ratings_scraper_urls(
return True


@asset(deps=["ratings_scraper_urls"])
def scraped_ratings_xmls(
@asset(deps=["ratings_scraper_urls_raw"])
def ratings_scraped_xml_raw(
ecs_resource: ConfigurableResource,
s3_resource: ConfigurableResource,
config_resource: ConfigurableResource,
@@ -228,8 +228,8 @@ def scraped_ratings_xmls(
return True


@asset(deps=["scraped_ratings_xmls"])
def ratings_data_df(
@asset(deps=["ratings_scraped_xml_raw"])
def ratings_dfs_dirty(
s3_resource: ConfigurableResource,
ecs_resource: ConfigurableResource,
config_resource: ConfigurableResource,
@@ -280,7 +280,7 @@ def ratings_data_df(


@asset
-def user_scraper_urls(
+def users_scraper_urls_raw(
lambda_resource: ConfigurableResource,
s3_resource: ConfigurableResource,
config_resource: ConfigurableResource,
@@ -325,8 +325,8 @@ def user_scraper_urls(
return True


@asset(deps=["user_scraper_urls"])
def scraped_user_xmls(
@asset(deps=["users_scraper_urls_raw"])
def users_scraped_xml_raw(
ecs_resource: ConfigurableResource,
s3_resource: ConfigurableResource,
config_resource: ConfigurableResource,
@@ -342,6 +342,57 @@ def scraped_user_xmls(
return True


@asset(deps=["users_scraped_xml_raw"])
def users_dfs_clean(
s3_resource: ConfigurableResource,
ecs_resource: ConfigurableResource,
config_resource: ConfigurableResource,
) -> bool:
"""
Creates a clean dataframe for the ratings data from the scraped ratings XML files
"""

configs = config_resource.get_config_file()

bucket = configs["s3_scraper_bucket"]
key = f'{WORKING_ENV_DIR}{configs["users"]["output_xml_directory"]}'

raw_ratings_files = s3_resource.list_file_keys(bucket=bucket, key=key)

assert len(raw_ratings_files) == 30 if ENVIRONMENT == "prod" else 1

task_definition = (
"bgg_users_data_cleaner"
if ENVIRONMENT == "prod"
else "dev_bgg_users_data_cleaner"
)

ecs_resource.launch_ecs_task(task_definition=task_definition)

check_filenames = [
f"{WORKING_ENV_DIR}{configs['users']['clean_dfs_directory']}/users_data.pkl"
]
logger.info(check_filenames)

original_timestamps = {
key: s3_resource.get_last_modified(
bucket=bucket,
key=key,
)
for key in check_filenames
}

compare_timestamps_for_refresh(
original_timestamps=original_timestamps,
file_list_to_check=check_filenames,
location_bucket=bucket,
sleep_timer=300,
s3_resource=s3_resource,
)

return True


@op
def compare_timestamps_for_refresh(
original_timestamps: dict,
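The body of `compare_timestamps_for_refresh` is collapsed in this view. Judging from its call sites above, a plausible sketch (an assumption, not the committed implementation; the `@op` decorator is omitted) is a poll loop that sleeps until each checked file's S3 LastModified timestamp moves past the value captured before the ECS task launched:

import time

def compare_timestamps_for_refresh(
    original_timestamps: dict,
    file_list_to_check: list,
    location_bucket: str,
    sleep_timer: int,
    s3_resource,
) -> None:
    # Hypothetical sketch: wait until every file has been rewritten in S3,
    # i.e. its LastModified differs from the timestamp captured before launch.
    pending = set(file_list_to_check)
    while pending:
        for key in list(pending):
            current = s3_resource.get_last_modified(bucket=location_bucket, key=key)
            if current is not None and current != original_timestamps.get(key):
                pending.discard(key)
        if pending:
            time.sleep(sleep_timer)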
2 changes: 1 addition & 1 deletion aws_dagster_bgg/jobs/__init__.py
@@ -3,5 +3,5 @@
bgg_job = define_asset_job("bgg_job", AssetSelection.all())

user_job = define_asset_job(
"user_job", selection=["user_scraper_urls", "scraped_user_xmls"]
"user_job", selection=["users_scraper_urls_raw", "users_scraped_xml_raw"]
)
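Dagster resolves these selection strings against asset names, so the job stays valid only because they match the renamed `@asset` functions in `assets.py`. A minimal, self-contained sketch of that relationship (placeholder asset bodies; the `Definitions` wiring is an assumption, not part of this commit):

from dagster import Definitions, asset, define_asset_job

@asset
def users_scraper_urls_raw() -> bool:
    # Placeholder body; the real asset generates the user scraper URL files.
    return True

@asset(deps=["users_scraper_urls_raw"])
def users_scraped_xml_raw() -> bool:
    # Placeholder body; the real asset launches the ECS scraper task.
    return True

# The selection strings must match the asset (function) names exactly.
user_job = define_asset_job(
    "user_job", selection=["users_scraper_urls_raw", "users_scraped_xml_raw"]
)

defs = Definitions(
    assets=[users_scraper_urls_raw, users_scraped_xml_raw],
    jobs=[user_job],
)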
3 changes: 2 additions & 1 deletion config.json
@@ -3,7 +3,8 @@
"game_dfs_dirty": "game_dfs_dirty",
"scraper_task_definition": "bgg_scraper",
"game_cleaner_task_definition": "bgg_game_data_cleaner",
"user_cleaner_task_definition": "bgg_user_data_cleaner",
"ratings_cleaner_task_definition": "bgg_ratings_data_cleaner",
"user_cleaner_task_definition": "bgg_users_data_cleaner",
"ecs_cluster": "boardgamegeek",
"orchestrator_task_definition": "bgg_orchestrator",
"boardgamegeek_csv_filename": "boardgames_ranks.csv",
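These keys are consumed elsewhere through a module-level `CONFIGS` mapping (see `CONFIGS["ratings_cleaner_task_definition"]` in the trigger lambda below). A minimal sketch of such a loader, assuming `config.json` sits at the repo root; the repo's actual `CONFIGS` construction is not shown in this diff:

import json

# Hypothetical loader: read the repo-root config.json into a dict.
with open("config.json") as f:
    CONFIGS = json.load(f)

print(CONFIGS["ratings_cleaner_task_definition"])  # prints "bgg_ratings_data_cleaner"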
28 changes: 24 additions & 4 deletions modules/bgg_boardgame_file_retrieval/get_bgg_games_file.py
@@ -1,6 +1,7 @@
import os
import time
import zipfile
+import csv
from os.path import expanduser
from tempfile import mkdtemp

@@ -24,8 +25,8 @@ def initialize_driver(default_directory: str) -> webdriver.Chrome:
options for the scraper to work. The function will return the
initialized driver."""

-# if not os.environ.get("ENVIRONMENT", "dev") == "prod":
-#     return webdriver.Chrome()
+if not os.environ.get("ENVIRONMENT", "dev") == "prod":
+    return webdriver.Chrome()

chrome_options = ChromeOptions()
chrome_options.add_argument("--headless=new")
@@ -106,13 +107,32 @@ def lambda_handler(event: dict = None, context: dict = None) -> None:
with zipfile.ZipFile(f"{default_directory}/Downloads/{filename}", "r") as zip_ref:
zip_ref.extractall(extract_directory)

+local_file = f"{extract_directory}/boardgames_ranks.csv"
+output_file = f"{extract_directory}/boardgames_ranks.tsv"
+
+# Input and output file paths
+input_file = local_file
+output_file = output_file
+
+# Convert CSV to TSV
+with open(input_file, "r") as csv_file, open(
+    output_file, "w", newline=""
+) as tsv_file:
+    csv_reader = csv.reader(csv_file)
+    tsv_writer = csv.writer(tsv_file, delimiter="\t")
+
+    for row in csv_reader:
+        tsv_writer.writerow(row)
+
+print(f"Converted {input_file} to {output_file}")

wr.s3.upload(
local_file=f"{extract_directory}/boardgames_ranks.csv",
local_file=output_file,
path=f"s3://{S3_SCRAPER_BUCKET}/data/prod/boardgames_ranks.csv",
)

wr.s3.upload(
local_file=f"{extract_directory}/boardgames_ranks.csv",
local_file=output_file,
path=f"s3://{S3_SCRAPER_BUCKET}/data/test/boardgames_ranks.csv",
)

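The stdlib loop above streams the CSV row by row, which keeps memory flat. For comparison, a pandas sketch of the same conversion (assuming pandas were available in this Lambda image, which this diff does not show; paths are illustrative):

import pandas as pd

# Hypothetical one-pass alternative: load the extracted CSV and re-emit it
# tab-separated. Simpler, but reads the whole file into memory.
df = pd.read_csv("/tmp/extracted/boardgames_ranks.csv")
df.to_csv("/tmp/extracted/boardgames_ranks.tsv", sep="\t", index=False)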
@@ -7,7 +7,7 @@

ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")
S3_SCRAPER_BUCKET = os.environ.get("S3_SCRAPER_BUCKET")
-SCRAPER_TASK_DEFINITION = CONFIGS["ratings_cleaner_task_definition"]
+TASK_DEFINITION = CONFIGS["ratings_cleaner_task_definition"]
TERRAFORM_STATE_BUCKET = os.environ.get("TF_VAR_BUCKET")


@@ -36,9 +36,7 @@ def lambda_handler(event, context):
terraform_state_file = get_terraform_state_file_for_vpc()

task_definition = (
f"dev_{SCRAPER_TASK_DEFINITION}"
if ENVIRONMENT != "prod"
else SCRAPER_TASK_DEFINITION
f"dev_{TASK_DEFINITION}" if ENVIRONMENT != "prod" else TASK_DEFINITION
)
print(task_definition)

15 changes: 8 additions & 7 deletions modules/ratings_data_cleaner/main.py
@@ -139,13 +139,14 @@ def _save_dfs_to_disk_or_s3(self, ratings_df: dict[pd.DataFrame]):
def _create_file_of_unique_user_ids(self, ratings_df: pd.DataFrame) -> list:
"""Create a list of unique user IDs"""

-user_ratings_count_df = ratings_df.groupby("username").count()["rating"]
-ratings_names_less_than_5 = user_ratings_count_df[
-    user_ratings_count_df < 5
-].index
-ratings_df = ratings_df.drop(
-    ratings_df[ratings_df["username"].isin(ratings_names_less_than_5)].index
-)
+if ENVIRONMENT == "prod":
+    user_ratings_count_df = ratings_df.groupby("username").count()["rating"]
+    ratings_names_less_than_5 = user_ratings_count_df[
+        user_ratings_count_df < 5
+    ].index
+    ratings_df = ratings_df.drop(
+        ratings_df[ratings_df["username"].isin(ratings_names_less_than_5)].index
+    )

unique_ids = {"list_of_ids": ratings_df["username"].unique().tolist()}

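The prod-only block above drops users with fewer than 5 ratings before the unique usernames are collected. A standalone sketch of the same filter (hypothetical helper, not the committed code):

import pandas as pd

def drop_sparse_raters(ratings_df: pd.DataFrame, min_ratings: int = 5) -> pd.DataFrame:
    # Count each username's ratings, aligned to rows, and keep users at or above the threshold.
    counts = ratings_df.groupby("username")["rating"].transform("count")
    return ratings_df[counts >= min_ratings]

# Example: "b" has only 2 ratings, so its rows are dropped.
df = pd.DataFrame({"username": ["a"] * 5 + ["b"] * 2, "rating": [7, 8, 6, 9, 7, 8, 6]})
assert set(drop_sparse_raters(df)["username"]) == {"a"}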
File renamed without changes.
File renamed without changes.
File renamed without changes.
