Commit

Update to work dynamically locally and on AWS. Added deploy.yml file for CI/CD in GitHub Actions.
Robinh0 committed Nov 4, 2024
1 parent 26d551b commit 38e9cb5
Showing 8 changed files with 234 additions and 129 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/deploy.yml
@@ -0,0 +1,43 @@
name: Deploy to AWS Lambda from ECR

on:
  push:
    branches:
      - main  # Triggers on pushes to the main branch

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Log in to Amazon ECR
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin ${{ secrets.ACCOUNT_ID }}.dkr.ecr.$AWS_REGION.amazonaws.com

      # - name: Run Unit Tests
      #   run: |
      #     docker build --target test -t your-image:test .
      #     docker run your-image:test  # Run unit tests

      - name: Build and Tag Docker Image
        run: |
          docker buildx build --platform linux/amd64 -t ${{ secrets.ACCOUNT_ID }}.dkr.ecr.eu-north-1.amazonaws.com/indeed-scraper:latest --push --provenance=false .

      - name: Update Lambda to Use New Image
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          aws lambda update-function-code --function-name indeed-scraper \
            --image-uri ${{ secrets.ACCOUNT_ID }}.dkr.ecr.$AWS_REGION.amazonaws.com/indeed-scraper:latest
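
Note: the workflow above expects the repository secrets AWS_REGION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and ACCOUNT_ID to exist. A minimal sketch of one way to register them with the GitHub CLI, using placeholder values rather than anything taken from this repository:

    # Placeholder values -- substitute the real region, keys and AWS account id
    gh secret set AWS_REGION --body "eu-north-1"
    gh secret set AWS_ACCESS_KEY_ID --body "<access-key-id>"
    gh secret set AWS_SECRET_ACCESS_KEY --body "<secret-access-key>"
    gh secret set ACCOUNT_ID --body "<12-digit-account-id>"
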
Binary file modified __pycache__/extract.cpython-311.pyc
Binary file not shown.
Binary file modified __pycache__/generics.cpython-311.pyc
Binary file not shown.
Binary file modified __pycache__/transform.cpython-311.pyc
Binary file not shown.
207 changes: 123 additions & 84 deletions extract.py
@@ -2,12 +2,16 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import os
from generics import setup_driver
import pandas as pd
import platform

if platform.system() == "Windows":
    from generics import setup_driver
    from undetected_chromedriver import Chrome

def scrape_search_results(driver, url_to_scrape):

def scrape_search_results(url, driver):
"""
Scrapes job titles and links from the provided Indeed search results URL.
@@ -19,107 +23,140 @@ def scrape_search_results(driver, url_to_scrape):
        pd.DataFrame: A DataFrame containing job titles and URLs.
        str: The first job link found on the page.
    """
    driver.get(url_to_scrape)
    def close_cookies(driver):
        short_wait = WebDriverWait(driver, 2)
        try:
            cookie_button = short_wait.until(EC.presence_of_element_located(
                (By.XPATH, "//button[@id='onetrust-reject-all-handler']")))
            cookie_button.click()
            # cookie_button_clicked = True
            print('Cookie button closed!')
            return True
        except:
            print('No cookies found.')
            return False

    def close_popup(driver):
        short_wait = WebDriverWait(driver, 2)
        try:
            button = short_wait.until(EC.presence_of_element_located(
                (By.XPATH, "//div[@id='mosaic-desktopserpjapopup']//button")))
            button.click()
            print('Popup closed!')
            return True
        except:
            print("No popup found.")
            return False

    def click_next_button():
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight)")
        next_button = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//nav//li/a[@aria-label='Next Page']")))
        print("Next page button found! Clicking it.")
        next_button.click()

    def reset_driver(driver, url_to_scrape):
        print("Resetting the driver")
        driver.quit()
        driver = setup_driver()
        driver.get(url=url_to_scrape)
        close_popup(driver)
        close_cookies(driver)
        print("Driver reset")
        return driver

    # sleep_random(200)
    continue_loop = True
    cookie_button_clicked = False
    popup_button_clicked = False
    data = []
    counter = 0
    max_pages_to_scrape = int(os.getenv("MAX_PAGES_TO_SCRAPE"))
    nr_items_per_page = int(os.getenv('NR_ITEMS_PER_PAGE'))
    url_to_scrape = url
    driver.get(url_to_scrape)
    while continue_loop:
        url_to_scrape = driver.current_url
        wait = WebDriverWait(driver, 5)  # 5-second timeout
        elements = wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, "//div[contains(@class, 'job_seen_beacon')]")))

        if not cookie_button_clicked:
            try:
                cookie_button = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//button[@id='onetrust-reject-all-handler']")))
                cookie_button.click()
                cookie_button_clicked = True
                print('Cookie button closed!')
            except:
                print('No cookies found.')
                with open('html_contents', 'w') as file:
                    file.write(driver.page_source)
            cookie_button_clicked = close_cookies(driver)

        if not popup_button_clicked:
            try:
                button = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//div[@id='mosaic-desktopserpjapopup']//button")))
                button.click()
                popup_button_clicked = True
                print('Popup closed!')
            except:
                print("No popup found.")
                pass
            popup_button_clicked = close_popup(driver)

        # Iterate through each element to extract job titles and links
        for element in elements:
            driver.execute_script("arguments[0].scrollIntoView();", element)
            title_link = element.find_element(By.XPATH, ".//h2/a")
            title_link.click()
            sleep_random(200)
            title = element.find_element(By.XPATH, ".//h2/a/span").text
            company_name = element.find_element(
                By.XPATH, ".//span[@data-testid='company-name']").text
            location = element.find_element(
                By.XPATH, ".//div[@data-testid='text-location']").text
            link = element.find_element(
                By.XPATH, './/h2/a').get_attribute("href")
            description = wait.until(EC.presence_of_element_located(
                (By.XPATH, ".//div[@id='jobDescriptionText']")))
            description_text = description.text
            description_html_content = description.get_attribute('innerHTML')

            salary = "-"
            try:
                salary_element = element.find_element(
                    By.XPATH, ".//div[contains(@class, 'salary-snippet-container')]")
                salary = salary_element.text if salary_element else "-"
            except:
                pass

            print(
                f"""Title: {title}\n
                URL: {link}\n
                company_name: {company_name}\n
                location: {location}\n
                salary: {salary}\n\n
                description: {description_text}\n\n
                description_html_content: {description_html_content}\n\n
                """)

            data.append(
                {
                    "title": title,
                    "url": link,
                    "company_name": company_name,
                    "location": location,
                    "salary": salary,
                    "description": description_text,
                    "html_content": description_html_content,
                }
            )
            break
        element_counter = 0
        try:
            for element in elements:
                print(f"Scraping element nr {element_counter}")
                sleep_random(200)
                # try:
                driver.execute_script(
                    "arguments[0].scrollIntoView();", element)
                sleep_random(200)
                title_link = element.find_element(By.XPATH, ".//h2/a")
                title_link.click()
                title = element.find_element(By.XPATH, ".//h2/a/span").text
                company_name = element.find_element(
                    By.XPATH, ".//span[@data-testid='company-name']").text
                location = element.find_element(
                    By.XPATH, ".//div[@data-testid='text-location']").text
                link = element.find_element(
                    By.XPATH, './/h2/a').get_attribute("href")
                description = wait.until(EC.presence_of_element_located(
                    (By.XPATH, ".//div[@id='jobDescriptionText']")))
                description_text = description.text
                description_html_content = description.get_attribute(
                    'innerHTML')

                # Extracting the salary if noted on page.
                salary = "-"
                try:
                    salary_element = element.find_element(
                        By.XPATH, ".//div[contains(@class, 'salary-snippet-container')]")
                    salary = salary_element.text if salary_element else "-"
                except:
                    pass

                print(
                    f"title: {title}\ncompany_name: {company_name}\nlocation: {location}\n")

                data.append(
                    {
                        "title": title,
                        "url": link,
                        "company_name": company_name,
                        "location": location,
                        "salary": salary,
                        "description": description_text,
                        "html_content": description_html_content,
                    }
                )
                if element_counter >= nr_items_per_page-1:
                    break
                else:
                    element_counter += 1
        except:
            driver = reset_driver(driver, url_to_scrape)
            continue

        # if not cookie_button_clicked:
        # cookie_button_clicked = close_cookies(driver)

        if counter < max_pages_to_scrape-1:
            print("Clicking the next button and scraping another page.")
            try:
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight)")
                next_button = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//nav//li/a[@aria-label='Next Page']")))
                print("Next page button found! Clicking it.")
                next_button.click()
                click_next_button()
                counter += 1
            except:
                print(
                    "Next button not found. Setting continue_loop to False and ending the scrape.")
                continue_loop = False
        else:
            print("All pages scraped. continue_loop is set to false.")
            continue_loop = False

    df = pd.DataFrame(data)
    # df.to_csv(f"results/{get_filename('descriptions')}", index=False)
    return df
@@ -128,14 +165,16 @@ def scrape_search_results(driver, url_to_scrape):
def extract():
"""Main function to orchestrate the ETL extract phase."""
try:
url = os.getenv('INDEED_URL')
# driver = setup_driver()
driver = setup_scrape_browser()
df = scrape_search_results(driver, url)
url = str(os.getenv('INDEED_URL'))
if platform.system() == "Windows":
driver = setup_driver()
else:
driver = setup_scrape_browser()
df = scrape_search_results(url, driver)
except Exception as e:
print(
f"An error occured in the extract process: {e}")
    finally:
        # os.remove(f"results/{get_filename('links')}")
        driver.quit()
        return df
    return df
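
For reference, a minimal local run of the extract phase could look like the sketch below, assuming the environment variables read by extract.py and generics.py (INDEED_URL, MAX_PAGES_TO_SCRAPE, NR_ITEMS_PER_PAGE, and SCRAPING_BROWSER_URI on non-Windows hosts) are set first; the values shown are placeholders, not taken from this commit:

    # Placeholder configuration -- adjust to the actual search URL and limits
    export INDEED_URL="https://www.indeed.com/jobs?q=python+developer"
    export MAX_PAGES_TO_SCRAPE=2
    export NR_ITEMS_PER_PAGE=5
    export SCRAPING_BROWSER_URI="<remote-scraping-browser-endpoint>"  # only used off Windows
    python -c "from extract import extract; df = extract(); print(df.head())"
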
29 changes: 15 additions & 14 deletions generics.py
@@ -8,14 +8,27 @@
from webdriver_manager.chrome import ChromeDriverManager
import os
import time
import undetected_chromedriver as uc

import platform

# Download NLTK resources
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('punkt_tab')

if platform.system() == "Windows":
    import undetected_chromedriver as uc

    def setup_driver():
        """Initializes the Chrome WebDriver instance."""
        chrome_options = uc.ChromeOptions()
        try:
            driver = uc.Chrome(options=chrome_options,
                               service=Service(ChromeDriverManager().install()))
        except Exception as e:
            print(f"Error setting up Chrome driver: {e}")
            return None
        return driver


def setup_scrape_browser():
    SBR_WEBDRIVER = os.getenv("SCRAPING_BROWSER_URI")
@@ -26,18 +39,6 @@ def setup_scrape_browser():
    return Remote(sbr_connection, options=chrome_options)


def setup_driver():
"""Initializes the Chrome WebDriver instance."""
chrome_options = uc.ChromeOptions()
try:
driver = uc.Chrome(options=chrome_options,
service=Service(ChromeDriverManager().install()))
except Exception as e:
print(f"Error setting up Chrome driver: {e}")
return None
return driver


def remove_stopwords(text):
"""
Removes stopwords from the given text based on the detected language.
