Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Bug fixes, adjustments, and cleanup. #84

Merged
merged 6 commits into from
Jul 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion jobfunnel/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '2.1.8'
__version__ = '2.1.9'
14 changes: 9 additions & 5 deletions jobfunnel/glassdoor_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ def __init__(self, args):
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
}
self.query = '-'.join(self.search_terms['keywords'])

def convert_radius(self, radius):
"""function that quantizes the user input radius to a valid radius
Expand All @@ -50,7 +49,7 @@ def convert_radius(self, radius):
radius = 25
elif 50 <= radius < 100:
radius = 50
elif 100 <= radius:
elif radius >= 100:
radius = 100
return radius

Expand All @@ -70,10 +69,15 @@ def convert_radius(self, radius):
elif radius >= 200:
radius = 200

glassdoor_radius = {0: 0, 10: 6, 20: 12,
30: 19, 50: 31, 100: 62, 200: 124}
glassdoor_radius = {0: 0,
10: 6,
20: 12,
30: 19,
50: 31,
100: 62,
200: 124}

return glassdoor_radius[radius]
return glassdoor_radius[radius]

def parse_blurb(self, job, html):
"""parses and stores job description into dict entry"""
Expand Down
2 changes: 2 additions & 0 deletions jobfunnel/glassdoor_dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ def __init__(self, args):
super().__init__(args)
self.provider = 'glassdoordynamic'

# Keeping old query function so this class does not break.
self.query = '-'.join(self.search_terms['keywords'])
# initialize the webdriver
self.driver = get_webdriver()

Expand Down
23 changes: 12 additions & 11 deletions jobfunnel/glassdoor_static.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from concurrent.futures import ThreadPoolExecutor, wait
from logging import info as log_info
from math import ceil
from requests import post
from time import sleep, time

from .jobfunnel import JobFunnel, MASTERLIST_HEADER
Expand All @@ -30,6 +29,10 @@ def __init__(self, args):
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
}
# Sets headers as default on Session object
self.s.headers.update(self.headers)
# Concatenates keywords with ' ' (space)
self.query = ' '.join(self.search_terms['keywords'])

def get_search_url(self, method='get'):
"""gets the glassdoor search url"""
Expand All @@ -40,8 +43,9 @@ def get_search_url(self, method='get'):
# form the location lookup url
location_url = 'https://www.glassdoor.co.in/findPopularLocationAjax.htm?'

# get the location id for search location
# get location id for search location
location_response = self.s.post(
# set location headers to override default session headers
location_url, headers=self.location_headers, data=data
).json()

Expand Down Expand Up @@ -69,23 +73,22 @@ def get_search_url(self, method='get'):
else:
raise ValueError(f'No html method {method} exists')

def search_page_for_job_soups(self, data, page, url, job_soup_list):
def search_page_for_job_soups(self, page, url, job_soup_list):
"""function that scrapes the glassdoor page for a list of job soups"""
log_info(f'getting glassdoor page {page} : {url}')

job = BeautifulSoup(
self.s.post(url, headers=self.headers,
data=data).text, self.bs4_parser
self.s.get(url).text, self.bs4_parser
).find_all('li', attrs={'class', 'jl'})
job_soup_list.extend(job)

def search_joblink_for_blurb(self, job):
"""function that scrapes the glassdoor job link for the blurb"""
search = job['link']
log_info(f'getting glassdoor search: {search}')

job_link_soup = BeautifulSoup(
self.s.post(
search, headers=self.location_headers).text, self.bs4_parser
self.s.get(search).text, self.bs4_parser
)

try:
Expand All @@ -105,7 +108,7 @@ def get_blurb_with_delay(self, job, delay):
search = job['link']
log_info(f'delay of {delay:.2f}s, getting glassdoor search: {search}')

res = self.s.post(search, headers=self.location_headers).text
res = self.s.get(search).text
return job, res

def scrape(self):
Expand All @@ -116,7 +119,7 @@ def scrape(self):
search, data = self.get_search_url(method='post')

# get the html data, initialize bs4 with lxml
request_html = self.s.post(search, headers=self.headers, data=data)
request_html = self.s.post(search, data=data)

# create the soup base
soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
Expand All @@ -143,7 +146,6 @@ def scrape(self):
fts.append( # append thread job future to futures list
threads.submit(
self.search_page_for_job_soups,
data,
page,
request_html.url,
job_soup_list,
Expand All @@ -167,7 +169,6 @@ def scrape(self):
fts.append( # append thread job future to futures list
threads.submit(
self.search_page_for_job_soups,
data,
page,
page_url,
job_soup_list,
Expand Down
20 changes: 11 additions & 9 deletions jobfunnel/indeed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from concurrent.futures import ThreadPoolExecutor, wait
from logging import info as log_info
from math import ceil
from requests import get
from time import sleep, time

from .jobfunnel import JobFunnel, MASTERLIST_HEADER
Expand All @@ -30,7 +29,10 @@ def __init__(self, args):
'Cache-Control': 'no-cache',
'Connection': 'keep-alive'
}
self.query = '+'.join(self.search_terms['keywords'])
# Sets headers as default on Session object
self.s.headers.update(self.headers)
# Concatenates keywords with '+' and encodes spaces as '+'
self.query = '+'.join(self.search_terms['keywords']).replace(' ', '+')

def convert_radius(self, radius):
"""function that quantizes the user input radius to a valid radius
Expand All @@ -47,19 +49,19 @@ def convert_radius(self, radius):
radius = 25
elif 50 <= radius < 100:
radius = 50
elif 100 <= radius:
elif radius >= 100:
radius = 100
return radius

def get_search_url(self, method='get'):
"""gets the indeed search url"""
if method == 'get':
# form job search url
search = ('http://www.indeed.{0}/jobs?'
search = ('https://www.indeed.{0}/jobs?'
'q={1}&l={2}%2C+{3}&radius={4}&limit={5}&filter={6}'.format(
self.search_terms['region']['domain'],
self.query,
self.search_terms['region']['city'],
self.search_terms['region']['city'].replace(' ', '+'),
self.search_terms['region']['province'],
self.convert_radius(
self.search_terms['region']['radius']),
Expand All @@ -79,7 +81,7 @@ def search_page_for_job_soups(self, search, page, job_soup_list):
log_info(f'getting indeed page {page} : {url}')

jobs = BeautifulSoup(
self.s.get(url, headers=self.headers).text, self.bs4_parser). \
self.s.get(url).text, self.bs4_parser). \
find_all('div', attrs={'data-tn-component': 'organicJob'})

job_soup_list.extend(jobs)
Expand All @@ -90,7 +92,7 @@ def search_joblink_for_blurb(self, job):
log_info(f'getting indeed page: {search}')

job_link_soup = BeautifulSoup(
self.s.get(search, headers=self.headers).text, self.bs4_parser)
self.s.get(search).text, self.bs4_parser)

try:
job['blurb'] = job_link_soup.find(
Expand All @@ -107,7 +109,7 @@ def get_blurb_with_delay(self, job, delay):
search = job['link']
log_info(f'delay of {delay:.2f}s, getting indeed search: {search}')

res = self.s.get(search, headers=self.headers).text
res = self.s.get(search).text
return job, res

def parse_blurb(self, job, html):
Expand Down Expand Up @@ -247,7 +249,7 @@ def scrape(self):
search = self.get_search_url()

# get the html data, initialize bs4 with lxml
request_html = self.s.get(search, headers=self.headers)
request_html = self.s.get(search)

# create the soup base
soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
Expand Down
7 changes: 5 additions & 2 deletions jobfunnel/jobfunnel.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,9 +308,12 @@ def delay_threader(self,
try:
job, html = future.result()
parse_fn(job, html)
except Exception:
del results[future]
del html
except Exception as e:
self.logger.error(f'Blurb Future Error: {e}')
pass
del results[future]


threads.shutdown() # clean up threads when done
# end and print recorded time
Expand Down
19 changes: 11 additions & 8 deletions jobfunnel/monster.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ def __init__(self, args):
'Cache-Control': 'no-cache',
'Connection': 'keep-alive'
}
self.query = '-'.join(self.search_terms['keywords'])
# Sets headers as default on Session object
self.s.headers.update(self.headers)
# Concatenates keywords with '-' and encodes spaces as '-'
self.query = '-'.join(self.search_terms['keywords']).replace(' ', '-')

def convert_radius(self, radius):
"""function that quantizes the user input radius to a valid radius
Expand Down Expand Up @@ -57,7 +60,7 @@ def convert_radius(self, radius):
radius = 100
elif 150 <= radius < 200:
radius = 150
elif 200 <= radius:
elif radius >= 200:
radius = 200
else:
if radius < 5:
Expand All @@ -70,7 +73,7 @@ def convert_radius(self, radius):
radius = 20
elif 50 <= radius < 100:
radius = 50
elif 100 <= radius:
elif radius >= 100:
radius = 100

return radius
Expand All @@ -83,7 +86,7 @@ def get_search_url(self, method='get'):
'q={1}&where={2}__2C-{3}&intcid={4}&rad={5}&where={2}__2c-{3}'.format(
self.search_terms['region']['domain'],
self.query,
self.search_terms['region']['city'],
self.search_terms['region']['city'].replace(' ', "-"),
self.search_terms['region']['province'],
'skr_navigation_nhpso_searchMain',
self.convert_radius(self.search_terms['region']['radius'])))
Expand All @@ -101,7 +104,7 @@ def search_joblink_for_blurb(self, job):
log_info(f'getting monster search: {search}')

job_link_soup = BeautifulSoup(
self.s.get(search, headers=self.headers).text, self.bs4_parser)
self.s.get(search).text, self.bs4_parser)

try:
job['blurb'] = job_link_soup.find(
Expand All @@ -120,7 +123,7 @@ def get_blurb_with_delay(self, job, delay):
search = job['link']
log_info(f'delay of {delay:.2f}s, getting monster search: {search}')

res = self.s.get(search, headers=self.headers).text
res = self.s.get(search).text
return job, res

def parse_blurb(self, job, html):
Expand All @@ -143,7 +146,7 @@ def scrape(self):
search = self.get_search_url()

# get the html data, initialize bs4 with lxml
request_html = self.s.get(search, headers=self.headers)
request_html = self.s.get(search)

# create the soup base
soup_base = BeautifulSoup(request_html.text, self.bs4_parser)
Expand All @@ -160,7 +163,7 @@ def scrape(self):
log_info(f'getting monster pages 1 to {pages} : {page_url}')

jobs = BeautifulSoup(
self.s.get(page_url, headers=self.headers).text, self.bs4_parser). \
self.s.get(page_url).text, self.bs4_parser). \
find_all('div', attrs={'class': 'flex-row'})

job_soup_list = []
Expand Down
2 changes: 1 addition & 1 deletion tests/test_indeed.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def test_get_search_url(self, init_scraper, search_terms_config):
provider = init_scraper('indeed')
provider.search_terms = search_terms_config
if(provider.search_terms['region']['domain'] == 'ca'):
assert'http://www.indeed.ca/jobs?q=Python&l=waterloo%2C+ON&radius=25&limit=50&filter=0' == provider.get_search_url()
assert'https://www.indeed.ca/jobs?q=Python&l=waterloo%2C+ON&radius=25&limit=50&filter=0' == provider.get_search_url()
with pytest.raises(ValueError) as e:
provider.get_search_url('panda')
assert str(e.value) == 'No html method panda exists'
Expand Down