Commit

Update to work dynamically locally and on AWS. Added deploy.yml file for CI/CD in GitHub Actions.
Robinh0 committed Nov 4, 2024
1 parent 26d551b commit 38e9cb5
Showing 8 changed files with 234 additions and 129 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/deploy.yml
@@ -0,0 +1,43 @@
name: Deploy to AWS Lambda from ECR

on:
  push:
    branches:
      - main  # Triggers on pushes to the main branch

jobs:
  deploy:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1

      - name: Log in to Amazon ECR
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          aws ecr get-login-password --region $AWS_REGION | docker login --username AWS --password-stdin ${{ secrets.ACCOUNT_ID }}.dkr.ecr.$AWS_REGION.amazonaws.com

      # - name: Run Unit Tests
      #   run: |
      #     docker build --target test -t your-image:test .
      #     docker run your-image:test  # Run unit tests

      - name: Build and Tag Docker Image
        run: |
          docker buildx build --platform linux/amd64 -t ${{ secrets.ACCOUNT_ID }}.dkr.ecr.eu-north-1.amazonaws.com/indeed-scraper:latest --push --provenance=false .

      - name: Update Lambda to Use New Image
        env:
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        run: |
          aws lambda update-function-code --function-name indeed-scraper \
            --image-uri ${{ secrets.ACCOUNT_ID }}.dkr.ecr.$AWS_REGION.amazonaws.com/indeed-scraper:latest
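
Note: the workflow above expects the repository secrets AWS_REGION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and ACCOUNT_ID to exist. A minimal sketch of one way to register them with the GitHub CLI, using placeholder values rather than anything taken from this repository:

    # Placeholder values -- substitute the real region, keys and AWS account id
    gh secret set AWS_REGION --body "eu-north-1"
    gh secret set AWS_ACCESS_KEY_ID --body "<access-key-id>"
    gh secret set AWS_SECRET_ACCESS_KEY --body "<secret-access-key>"
    gh secret set ACCOUNT_ID --body "<12-digit-account-id>"
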
Binary file modified __pycache__/extract.cpython-311.pyc
Binary file not shown.
Binary file modified __pycache__/generics.cpython-311.pyc
Binary file not shown.
Binary file modified __pycache__/transform.cpython-311.pyc
Binary file not shown.
207 changes: 123 additions & 84 deletions extract.py
@@ -2,12 +2,16 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import os
from generics import setup_driver
import pandas as pd
import platform

if platform.system() == "Windows":
    from generics import setup_driver
    from undetected_chromedriver import Chrome

def scrape_search_results(driver, url_to_scrape):

def scrape_search_results(url, driver):
"""
Scrapes job titles and links from the provided Indeed search results URL.
@@ -19,107 +23,140 @@ def scrape_search_results(driver, url_to_scrape):
        pd.DataFrame: A DataFrame containing job titles and URLs.
        str: The first job link found on the page.
    """
    driver.get(url_to_scrape)
    def close_cookies(driver):
        short_wait = WebDriverWait(driver, 2)
        try:
            cookie_button = short_wait.until(EC.presence_of_element_located(
                (By.XPATH, "//button[@id='onetrust-reject-all-handler']")))
            cookie_button.click()
            # cookie_button_clicked = True
            print('Cookie button closed!')
            return True
        except:
            print('No cookies found.')
            return False

    def close_popup(driver):
        short_wait = WebDriverWait(driver, 2)
        try:
            button = short_wait.until(EC.presence_of_element_located(
                (By.XPATH, "//div[@id='mosaic-desktopserpjapopup']//button")))
            button.click()
            print('Popup closed!')
            return True
        except:
            print("No popup found.")
            return False

    def click_next_button():
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight)")
        next_button = wait.until(EC.presence_of_element_located(
            (By.XPATH, "//nav//li/a[@aria-label='Next Page']")))
        print("Next page button found! Clicking it.")
        next_button.click()

    def reset_driver(driver, url_to_scrape):
        print("Resetting the driver")
        driver.quit()
        driver = setup_driver()
        driver.get(url=url_to_scrape)
        close_popup(driver)
        close_cookies(driver)
        print("Driver reset")
        return driver

    # sleep_random(200)
    continue_loop = True
    cookie_button_clicked = False
    popup_button_clicked = False
    data = []
    counter = 0
    max_pages_to_scrape = int(os.getenv("MAX_PAGES_TO_SCRAPE"))
    nr_items_per_page = int(os.getenv('NR_ITEMS_PER_PAGE'))
    url_to_scrape = url
    driver.get(url_to_scrape)
    while continue_loop:
        url_to_scrape = driver.current_url
        wait = WebDriverWait(driver, 5)  # 5-second timeout
        elements = wait.until(EC.presence_of_all_elements_located(
            (By.XPATH, "//div[contains(@class, 'job_seen_beacon')]")))

        if not cookie_button_clicked:
            try:
                cookie_button = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//button[@id='onetrust-reject-all-handler']")))
                cookie_button.click()
                cookie_button_clicked = True
                print('Cookie button closed!')
            except:
                print('No cookies found.')
                with open('html_contents', 'w') as file:
                    file.write(driver.page_source)
            cookie_button_clicked = close_cookies(driver)

        if not popup_button_clicked:
            try:
                button = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//div[@id='mosaic-desktopserpjapopup']//button")))
                button.click()
                popup_button_clicked = True
                print('Popup closed!')
            except:
                print("No popup found.")
                pass
            popup_button_clicked = close_popup(driver)

        # Iterate through each element to extract job titles and links
        for element in elements:
            driver.execute_script("arguments[0].scrollIntoView();", element)
            title_link = element.find_element(By.XPATH, ".//h2/a")
            title_link.click()
            sleep_random(200)
            title = element.find_element(By.XPATH, ".//h2/a/span").text
            company_name = element.find_element(
                By.XPATH, ".//span[@data-testid='company-name']").text
            location = element.find_element(
                By.XPATH, ".//div[@data-testid='text-location']").text
            link = element.find_element(
                By.XPATH, './/h2/a').get_attribute("href")
            description = wait.until(EC.presence_of_element_located(
                (By.XPATH, ".//div[@id='jobDescriptionText']")))
            description_text = description.text
            description_html_content = description.get_attribute('innerHTML')

            salary = "-"
            try:
                salary_element = element.find_element(
                    By.XPATH, ".//div[contains(@class, 'salary-snippet-container')]")
                salary = salary_element.text if salary_element else "-"
            except:
                pass

            print(
                f"""Title: {title}\n
                URL: {link}\n
                company_name: {company_name}\n
                location: {location}\n
                salary: {salary}\n\n
                description: {description_text}\n\n
                description_html_content: {description_html_content}\n\n
                """)

            data.append(
                {
                    "title": title,
                    "url": link,
                    "company_name": company_name,
                    "location": location,
                    "salary": salary,
                    "description": description_text,
                    "html_content": description_html_content,
                }
            )
            break
        element_counter = 0
        try:
            for element in elements:
                print(f"Scraping element nr {element_counter}")
                sleep_random(200)
                # try:
                driver.execute_script(
                    "arguments[0].scrollIntoView();", element)
                sleep_random(200)
                title_link = element.find_element(By.XPATH, ".//h2/a")
                title_link.click()
                title = element.find_element(By.XPATH, ".//h2/a/span").text
                company_name = element.find_element(
                    By.XPATH, ".//span[@data-testid='company-name']").text
                location = element.find_element(
                    By.XPATH, ".//div[@data-testid='text-location']").text
                link = element.find_element(
                    By.XPATH, './/h2/a').get_attribute("href")
                description = wait.until(EC.presence_of_element_located(
                    (By.XPATH, ".//div[@id='jobDescriptionText']")))
                description_text = description.text
                description_html_content = description.get_attribute(
                    'innerHTML')

                # Extracting the salary if noted on page.
                salary = "-"
                try:
                    salary_element = element.find_element(
                        By.XPATH, ".//div[contains(@class, 'salary-snippet-container')]")
                    salary = salary_element.text if salary_element else "-"
                except:
                    pass

                print(
                    f"title: {title}\ncompany_name: {company_name}\nlocation: {location}\n")

                data.append(
                    {
                        "title": title,
                        "url": link,
                        "company_name": company_name,
                        "location": location,
                        "salary": salary,
                        "description": description_text,
                        "html_content": description_html_content,
                    }
                )
                if element_counter >= nr_items_per_page-1:
                    break
                else:
                    element_counter += 1
        except:
            driver = reset_driver(driver, url_to_scrape)
            continue

        # if not cookie_button_clicked:
        # cookie_button_clicked = close_cookies(driver)

        if counter < max_pages_to_scrape-1:
            print("Clicking the next button and scraping another page.")
            try:
                driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight)")
                next_button = wait.until(EC.presence_of_element_located(
                    (By.XPATH, "//nav//li/a[@aria-label='Next Page']")))
                print("Next page button found! Clicking it.")
                next_button.click()
                click_next_button()
                counter += 1
            except:
                print(
                    "Next button not found. Setting continue_loop to False and ending the scrape.")
                continue_loop = False
        else:
            print("All pages scraped. continue_loop is set to false.")
            continue_loop = False

    df = pd.DataFrame(data)
    # df.to_csv(f"results/{get_filename('descriptions')}", index=False)
    return df
@@ -128,14 +165,16 @@ def scrape_search_results(driver, url_to_scrape):
def extract():
"""Main function to orchestrate the ETL extract phase."""
try:
url = os.getenv('INDEED_URL')
# driver = setup_driver()
driver = setup_scrape_browser()
df = scrape_search_results(driver, url)
url = str(os.getenv('INDEED_URL'))
if platform.system() == "Windows":
driver = setup_driver()
else:
driver = setup_scrape_browser()
df = scrape_search_results(url, driver)
except Exception as e:
print(
f"An error occured in the extract process: {e}")
    finally:
        # os.remove(f"results/{get_filename('links')}")
        driver.quit()
        return df
    return df
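
For reference, a minimal local run of the extract phase could look like the sketch below, assuming the environment variables read by extract.py and generics.py (INDEED_URL, MAX_PAGES_TO_SCRAPE, NR_ITEMS_PER_PAGE, and SCRAPING_BROWSER_URI on non-Windows hosts) are set first; the values shown are placeholders, not taken from this commit:

    # Placeholder configuration -- adjust to the actual search URL and limits
    export INDEED_URL="https://www.indeed.com/jobs?q=python+developer"
    export MAX_PAGES_TO_SCRAPE=2
    export NR_ITEMS_PER_PAGE=5
    export SCRAPING_BROWSER_URI="<remote-scraping-browser-endpoint>"  # only used off Windows
    python -c "from extract import extract; df = extract(); print(df.head())"
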
29 changes: 15 additions & 14 deletions generics.py
@@ -8,14 +8,27 @@
from webdriver_manager.chrome import ChromeDriverManager
import os
import time
import undetected_chromedriver as uc

import platform

# Download NLTK resources
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('punkt_tab')

if platform.system() == "Windows":
    import undetected_chromedriver as uc

    def setup_driver():
        """Initializes the Chrome WebDriver instance."""
        chrome_options = uc.ChromeOptions()
        try:
            driver = uc.Chrome(options=chrome_options,
                               service=Service(ChromeDriverManager().install()))
        except Exception as e:
            print(f"Error setting up Chrome driver: {e}")
            return None
        return driver


def setup_scrape_browser():
    SBR_WEBDRIVER = os.getenv("SCRAPING_BROWSER_URI")
@@ -26,18 +39,6 @@ def setup_scrape_browser():
    return Remote(sbr_connection, options=chrome_options)


def setup_driver():
"""Initializes the Chrome WebDriver instance."""
chrome_options = uc.ChromeOptions()
try:
driver = uc.Chrome(options=chrome_options,
service=Service(ChromeDriverManager().install()))
except Exception as e:
print(f"Error setting up Chrome driver: {e}")
return None
return driver


def remove_stopwords(text):
"""
Removes stopwords from the given text based on the detected language.
