Skip to content

Commit

Permalink
refactor user cleaner for new file structure
Browse files Browse the repository at this point in the history
  • Loading branch information
threnjen committed Nov 28, 2024
1 parent fd22398 commit 48eb900
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 29 deletions.
50 changes: 22 additions & 28 deletions modules/users_data_cleaner/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def data_extraction_chain(self):
self._save_dfs_to_disk_or_s3(
directory="dirty_dfs_directory", table_name="user_data", df=users_df
)
merged_df = self.merge_with_other_ratings_file(users_df)
merged_df = self.merge_with_other_ratings_file()
self._save_dfs_to_disk_or_s3(
directory="clean_dfs_directory",
table_name="complete_user_ratings",
Expand All @@ -47,6 +47,7 @@ def _get_file_list(self) -> list[str]:

xml_directory = USER_CONFIGS["output_xml_directory"]
file_list_to_process = get_s3_keys_based_on_env(xml_directory)
print(f"User files to process: {len(file_list_to_process)}\n\n")
if not file_list_to_process:
local_files = get_local_keys_based_on_env(xml_directory)
file_list_to_process = [x for x in local_files if x.endswith(".xml")]
def _process_file_list(self, file_list_to_process: list) -> list[dict]:
    """Parse each per-user XML file and aggregate every rating entry.

    Args:
        file_list_to_process: keys/paths of user XML files, one file per user.

    Returns:
        All rating entries collected across every file, as a flat list.
        Also increments ``self.total_entries`` as a side effect.
    """
    all_entries = []

    for file_name in file_list_to_process:

        # Load the file contents, preferring a local copy over S3;
        # only the basename of the key is used for the lookup.
        local_open = load_file_local_first(
            path=USER_CONFIGS["output_xml_directory"],
            file_name=file_name.split("/")[-1],
        )

        game_page = BeautifulSoup(local_open, features="xml")

        # Assumes file names look like ".../user_<username>.xml" — the
        # username is recovered from the key rather than the XML payload.
        username = file_name.split("user_")[-1].split(".xml")[0]
        print(f"\nParsing user {username}")

        one_user_reviews = self._get_ratings_from_user(username, game_page)
        print(f"Ratings for user {username}: {len(one_user_reviews)}")

        self.total_entries += len(one_user_reviews)

        all_entries += one_user_reviews

    # Fixed duplicated word in the log message ("ratings ratings").
    print(f"\nTotal number of ratings processed: {self.total_entries}")

    return all_entries

def _get_individual_user(self, file_name: str) -> list:
    """Load a user XML file and return every <username> element in it.

    Args:
        file_name: key/path of the XML file; only the basename is used
            for the lookup.

    Returns:
        All <username> tags found in the file (a bs4 ResultSet, which is
        list-like). The previous annotation claimed
        ``Tuple[str, list[BeautifulSoup]]``, which did not match the single
        value actually returned — fixed to ``list``.
    """
    # Load the file contents, preferring a local copy over S3.
    local_open = load_file_local_first(
        path=USER_CONFIGS["output_xml_directory"],
        file_name=file_name.split("/")[-1],
    )

    game_page = BeautifulSoup(local_open, features="xml")

    user_entries = game_page.find_all("username")

    print(f"\nTotal number of users in file: {len(user_entries)}\n")

    return user_entries

def _get_ratings_from_user(
self, username: str, user_entry: BeautifulSoup
) -> list[list]:
Expand Down Expand Up @@ -124,10 +112,16 @@ def _create_table_from_data(self, all_entries: dict[list]) -> pd.DataFrame:

return df

def merge_with_other_ratings_file(self, users_df):
def merge_with_other_ratings_file(self):

users_df = load_file_local_first(
path=USER_CONFIGS["dirty_dfs_directory"], file_name=f"user_data.pkl"
)
ratings_df = load_file_local_first(
path=RATINGS_CONFIGS["clean_dfs_directory"], file_name=f"ratings_data.pkl"
path=RATINGS_CONFIGS["dirty_dfs_directory"], file_name=f"ratings_data.pkl"
)
print(ratings_df.info())
print(users_df.info())

merged_df = pd.merge(
ratings_df[["username", "BGGId", "rating", "value"]],
Expand Down
2 changes: 1 addition & 1 deletion utils/processing_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@


def get_s3_keys_based_on_env(directory: str):
    """List S3 object keys under *directory* for the current environment.

    Args:
        directory: directory path relative to the working dir.

    Returns:
        Whatever ``S3FileHandler.list_files`` returns for the expanded URI.
    """
    # Expand to a fully-qualified s3:// URI — the handler's lister expects
    # a full URI rather than a bare bucket-relative prefix.
    directory = f"s3://{S3_SCRAPER_BUCKET}/{WORKING_DIR}{directory}"
    return S3FileHandler().list_files(directory)


Expand Down
3 changes: 3 additions & 0 deletions utils/s3_file_handler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import os
import pickle
import awswrangler as wr
from datetime import datetime
from typing import Any, Union

Expand Down Expand Up @@ -113,6 +114,8 @@ def file_exists(self, file_path: str) -> bool:
return False

def list_files(self, directory: str) -> list[str]:
    """List object keys under *directory* (a fully-qualified s3:// URI).

    Delegates to awswrangler. The superseded boto3 ``list_objects_v2``
    implementation was left unreachable after this return and has been
    removed.
    """
    return wr.s3.list_objects(directory)
Expand Down

0 comments on commit 48eb900

Please sign in to comment.