Commit: create deployments for user data cleaner

threnjen committed Nov 26, 2024
1 parent 927724b, commit b74e3df

Showing 10 changed files with 1,416 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/dev_deployment_ecs_ratings_cleaner.yml
@@ -36,6 +36,6 @@ jobs:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
ECR_REPOSITORY: dev_bgg_ratings_data_cleaner
run: |
- DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.user-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
+ DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.ratings-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
docker push $ECR_REGISTRY/$ECR_REPOSITORY
41 changes: 41 additions & 0 deletions .github/workflows/dev_deployment_ecs_users_cleaner.yml
@@ -0,0 +1,41 @@
name: DEV deployment to boardgamegeek users data cleaner

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

permissions:
  id-token: write # This is required for requesting the JWT
  contents: read # This is required for actions/checkout

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.sha }}

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_GITHUB_ROLE }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      - name: Build, tag, push image to AWS ECR dev_bgg_users_data_cleaner
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: dev_bgg_users_data_cleaner
        run: |
          DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.user-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
          docker push $ECR_REGISTRY/$ECR_REPOSITORY
2 changes: 1 addition & 1 deletion .github/workflows/prod_deployment_ecs_ratings_cleaner.yml
@@ -28,5 +28,5 @@ jobs:
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
ECR_REPOSITORY: bgg_ratings_data_cleaner
run: |
- DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.user-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
+ DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.ratings-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
docker push $ECR_REGISTRY/$ECR_REPOSITORY
32 changes: 32 additions & 0 deletions .github/workflows/prod_deployment_ecs_users_cleaner.yml
@@ -0,0 +1,32 @@
name: PROD deployment to boardgamegeek users data cleaner

on:
  push:
    branches:
      - main

permissions:
  id-token: write # This is required for requesting the JWT
  contents: read # This is required for actions/checkout

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v2
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_GITHUB_ROLE }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2
      - name: Build, tag, push image to AWS ECR bgg_users_data_cleaner
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: bgg_users_data_cleaner
        run: |
          DOCKER_BUILDKIT=1 docker build -f Dockerfiles/Dockerfile.user-data-cleaner --build-arg GROUP="group1" -t $ECR_REGISTRY/$ECR_REPOSITORY .
          docker push $ECR_REGISTRY/$ECR_REPOSITORY
25 changes: 25 additions & 0 deletions Dockerfiles/Dockerfile.ratings-data-cleaner
@@ -0,0 +1,25 @@
FROM python:3.12

# Install required dependencies
RUN apt-get update && apt-get install -y \
    python3-pip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install pipenv
RUN pip3 install pipenv

# Create necessary directories
RUN mkdir -p modules

# Copy the source code into the container
COPY modules/user_data_cleaner modules/user_data_cleaner
COPY utils utils
COPY data data
COPY modules/user_data_cleaner/Pipfile* .
COPY config.py .

# Install dependencies with pipenv
RUN pipenv sync

ENTRYPOINT ["pipenv", "run", "python", "modules/user_data_cleaner/main.py"]
@@ -10,13 +10,12 @@ RUN apt-get update && apt-get install -y \
RUN pip3 install pipenv

# Create necessary directories
- RUN mkdir -p data/prod/users/ratings_dfs_clean \
-     data/test/users/ratings_dfs_clean \
-     modules
+ RUN mkdir -p modules

# Copy the source code into the container
COPY modules/ratings_data_cleaner modules/ratings_data_cleaner
COPY utils utils
+ COPY data data
COPY modules/ratings_data_cleaner/Pipfile* .
COPY config.py .

16 changes: 10 additions & 6 deletions modules/bgg_scraper/main.py
@@ -122,11 +122,11 @@ def parse(self, response):
return

# Process the valid response
- self._save_response(response, response.meta["group_num"])
+ self._save_response(response, response.url)

- def _save_response(self, response: scrapy.http.Response, response_id: int):
-     timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
-     file_path = f"{WORKING_DIR}{self.save_file_path}/{self.group}_{response_id}_{timestamp}.xml"
+ def _save_response(self, response: scrapy.http.Response, url: str):
+     user_id = url.split("=")[1].split("&")[0]
+     file_path = f"{WORKING_DIR}{self.save_file_path}/{user_id}.xml"
self.file_handler.save_file(file_path=file_path, data=response.body)
self.logger.info(f"Response saved to {file_path}")

@@ -212,9 +212,8 @@ def _combine_xml_files_to_master(self) -> str:
saved_files = [
x
for x in get_local_keys_based_on_env(self.scraped_files_folder)
- if self.file_group in x and "combined" not in x and ".gitkeep" not in x
+ if "combined" not in x and ".gitkeep" not in x
]
- print(f"{saved_files}\n\n")

# Parse the first XML file to get the root and header
tree = ET.parse(saved_files[0])
@@ -229,6 +228,11 @@ def _combine_xml_files_to_master(self) -> str:
tree = ET.parse(xml_file)
root = tree.getroot()

+ if self.scraper_type == "users":
+     user_name = xml_file.split("/")[-1].split(".")[0]
+     user_tag = ET.SubElement(combined_root, "username")
+     user_tag.text = user_name
+
# Append each <item> element to the new root
for item in root.findall("item"):
combined_root.append(item)
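For context, a minimal sketch of what the new _save_response naming produces from a scraped URL, assuming a BGG XML API2 collection request of the form ...?username=<name>&... (the example URL, WORKING_DIR value, and folder name are assumptions for illustration; the commit only shows the split expression):

# Hypothetical illustration of the new per-user file naming; not part of the commit.
WORKING_DIR = "./"                   # assumed value; defined elsewhere in the repo's config
save_file_path = "data/scraped_xml"  # assumed folder name for this sketch

def user_id_from_url(url: str) -> str:
    # Mirrors the committed expression url.split("=")[1].split("&")[0],
    # i.e. the text between the first "=" and the following "&".
    return url.split("=")[1].split("&")[0]

url = "https://boardgamegeek.com/xmlapi2/collection?username=threnjen&rated=1"
file_path = f"{WORKING_DIR}{save_file_path}/{user_id_from_url(url)}.xml"
print(file_path)  # ./data/scraped_xml/threnjen.xml

So each response is now saved under the scraped user's name rather than a group/timestamp key.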
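Likewise, a minimal sketch of the new per-user tagging in _combine_xml_files_to_master, assuming the BGG API's usual items root element (the root name and the example file path are assumptions; only the username sub-element logic appears in the commit):

import xml.etree.ElementTree as ET

combined_root = ET.Element("items")         # assumed root name for the master file

# For a users scrape, each per-user file contributes a <username> element whose
# text is the file stem, followed by that file's <item> elements.
xml_file = "data/scraped_xml/threnjen.xml"  # hypothetical path
user_name = xml_file.split("/")[-1].split(".")[0]
user_tag = ET.SubElement(combined_root, "username")
user_tag.text = user_name

print(ET.tostring(combined_root, encoding="unicode"))
# <items><username>threnjen</username></items>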
23 changes: 23 additions & 0 deletions modules/user_data_cleaner/Pipfile
@@ -0,0 +1,23 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
pandas = "*"
requests = "*"
lxml = "*"
bs4 = "*"
regex = "*"
boto3 = "*"
awswrangler = "*"
xmltodict = "*"

[dev-packages]
pytest = "*"
black = {extras = ["jupyter"], version = "*"}
ipykernel = "*"
isort = "*"

[requires]
python_version = "3.12"