Commit: create deployments for user data cleaner

threnjen committed Nov 26, 2024
1 parent 927724b, commit b74e3df

Showing 10 changed files with 1,416 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev_deployment_ecs_ratings_cleaner.yml
@@ -36,6 +36,6 @@ jobs:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
ECR_REPOSITORY: dev_bgg_ratings_data_cleaner
run: |
- DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.user-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
+ DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.ratings-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
docker push $ECR_REGISTRY/$ECR_REPOSITORY
41 changes: 41 additions & 0 deletions .github/workflows/dev_deployment_ecs_users_cleaner.yml
@@ -0,0 +1,41 @@
name: DEV deployment to boardgamegeek users data cleaner

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

permissions:
  id-token: write # This is required for requesting the JWT
  contents: read # This is required for actions/checkout

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.sha }}

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_GITHUB_ROLE }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      - name: Build, tag, push image to AWS ECR dev_bgg_users_data_cleaner
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: dev_bgg_users_data_cleaner
        run: |
          DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.user-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
          docker push $ECR_REGISTRY/$ECR_REPOSITORY
2 changes: 1 addition & 1 deletion .github/workflows/prod_deployment_ecs_ratings_cleaner.yml
@@ -28,5 +28,5 @@ jobs:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
ECR_REPOSITORY: bgg_ratings_data_cleaner
run: |
- DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.user-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
+ DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.ratings-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
docker push $ECR_REGISTRY/$ECR_REPOSITORY
32 changes: 32 additions & 0 deletions .github/workflows/prod_deployment_ecs_users_cleaner.yml
@@ -0,0 +1,32 @@
name: PROD deployment to boardgamegeek users data cleaner

on:
  push:
    branches:
      - main

permissions:
  id-token: write # This is required for requesting the JWT
  contents: read # This is required for actions/checkout

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v2
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_GITHUB_ROLE }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2
      - name: Build, tag, push image to AWS ECR bgg_users_data_cleaner
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: bgg_users_data_cleaner
        run: |
          DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.user-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
          docker push $ECR_REGISTRY/$ECR_REPOSITORY
25 changes: 25 additions & 0 deletions Dockerfiles/Dockerfile.ratings-data-cleaner
@@ -0,0 +1,25 @@
FROM python:3.12

# Install required dependencies
RUN apt-get update && apt-get install -y \
    python3-pip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install pipenv
RUN pip3 install pipenv

# Create necessary directories
RUN mkdir -p modules

# Copy the source code into the container
COPY modules/user_data_cleaner modules/user_data_cleaner
COPY utils utils
COPY data data
COPY modules/user_data_cleaner/Pipfile* .
COPY config.py .

# Install dependencies with pipenv
RUN pipenv sync

ENTRYPOINT ["pipenv", "run", "python", "modules/user_data_cleaner/main.py"]
@@ -10,13 +10,12 @@ RUN apt-get update && apt-get install -y \
RUN pip3 install pipenv

# Create necessary directories
- RUN mkdir -p data/prod/users/ratings_dfs_clean \
-     data/test/users/ratings_dfs_clean \
-     modules
+ RUN mkdir -p modules

# Copy the source code into the container
COPY modules/ratings_data_cleaner modules/ratings_data_cleaner
COPY utils utils
+ COPY data data
COPY modules/ratings_data_cleaner/Pipfile* .
COPY config.py .

16 changes: 10 additions & 6 deletions modules/bgg_scraper/main.py
@@ -122,11 +122,11 @@ def parse(self, response):
return

# Process the valid response
- self._save_response(response, response.meta["group_num"])
+ self._save_response(response, response.url)

- def _save_response(self, response: scrapy.http.Response, response_id: int):
-     timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
-     file_path = f"{WORKING_DIR}{self.save_file_path}/{self.group}_{response_id}_{timestamp}.xml"
+ def _save_response(self, response: scrapy.http.Response, url: str):
+     user_id = url.split("=")[1].split("&")[0]
+     file_path = f"{WORKING_DIR}{self.save_file_path}/{user_id}.xml"
self.file_handler.save_file(file_path=file_path, data=response.body)
self.logger.info(f"Response saved to {file_path}")

@@ -212,9 +212,8 @@ def _combine_xml_files_to_master(self) -> str:
saved_files = [
x
for x in get_local_keys_based_on_env(self.scraped_files_folder)
- if self.file_group in x and "combined" not in x and ".gitkeep" not in x
+ if "combined" not in x and ".gitkeep" not in x
]
- print(f"{saved_files}\n\n")

# Parse the first XML file to get the root and header
tree = ET.parse(saved_files[0])
@@ -229,6 +228,11 @@ def _combine_xml_files_to_master(self) -> str:
tree = ET.parse(xml_file)
root = tree.getroot()

+ if self.scraper_type == "users":
+     user_name = xml_file.split("/")[-1].split(".")[0]
+     user_tag = ET.SubElement(combined_root, "username")
+     user_tag.text = user_name
+
# Append each <item> element to the new root
for item in root.findall("item"):
combined_root.append(item)
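For context, a minimal sketch of what the new _save_response naming produces from a scraped URL, assuming a BGG XML API2 collection request of the form ...?username=<name>&... (the example URL, WORKING_DIR value, and folder name are assumptions for illustration; the commit only shows the split expression):

# Hypothetical illustration of the new per-user file naming; not part of the commit.
WORKING_DIR = "./"                   # assumed value; defined elsewhere in the repo's config
save_file_path = "data/scraped_xml"  # assumed folder name for this sketch

def user_id_from_url(url: str) -> str:
    # Mirrors the committed expression url.split("=")[1].split("&")[0],
    # i.e. the text between the first "=" and the following "&".
    return url.split("=")[1].split("&")[0]

url = "https://boardgamegeek.com/xmlapi2/collection?username=threnjen&rated=1"
file_path = f"{WORKING_DIR}{save_file_path}/{user_id_from_url(url)}.xml"
print(file_path)  # ./data/scraped_xml/threnjen.xml

So each response is now saved under the scraped user's name rather than a group/timestamp key.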
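Likewise, a minimal sketch of the new per-user tagging in _combine_xml_files_to_master, assuming the BGG API's usual items root element (the root name and the example file path are assumptions; only the username sub-element logic appears in the commit):

import xml.etree.ElementTree as ET

combined_root = ET.Element("items")         # assumed root name for the master file

# For a users scrape, each per-user file contributes a <username> element whose
# text is the file stem, followed by that file's <item> elements.
xml_file = "data/scraped_xml/threnjen.xml"  # hypothetical path
user_name = xml_file.split("/")[-1].split(".")[0]
user_tag = ET.SubElement(combined_root, "username")
user_tag.text = user_name

print(ET.tostring(combined_root, encoding="unicode"))
# <items><username>threnjen</username></items>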
23 changes: 23 additions & 0 deletions modules/user_data_cleaner/Pipfile
@@ -0,0 +1,23 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
pandas = "*"
requests = "*"
lxml = "*"
bs4 = "*"
regex = "*"
boto3 = "*"
awswrangler = "*"
xmltodict = "*"

[dev-packages]
pytest = "*"
black = {extras = ["jupyter"], version = "*"}
ipykernel = "*"
isort = "*"

[requires]
python_version = "3.12"