Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Germany_German Support #132

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion jobfunnel/backend/scrapers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,4 +456,12 @@ class BaseFRFreScraper(BaseScraper):
"""
@property
def locale(self) -> Locale:
return Locale.FRANCE_FRENCH
return Locale.FRANCE_FRENCH


class BaseGEGerScraper(BaseScraper):
    """Localized scraper for Germany German
    """
    @property
    def locale(self) -> Locale:
        """Return the locale this scraper targets (Locale.GERMANY_GERMAN)."""
        return Locale.GERMANY_GERMAN
67 changes: 65 additions & 2 deletions jobfunnel/backend/scrapers/indeed.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from jobfunnel.backend.scrapers.base import (BaseCANEngScraper, BaseScraper,
BaseUSAEngScraper,
BaseUKEngScraper,
BaseFRFreScraper)
BaseFRFreScraper,
BaseGEGerScraper)
from jobfunnel.backend.tools.filters import JobFilter
from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str
from jobfunnel.resources import MAX_CPU_WORKERS, JobField, Remoteness
Expand Down Expand Up @@ -419,4 +420,66 @@ def _get_num_search_result_pages(self, search_url: str, max_pages=0) -> int:
elif number_of_pages < max_pages:
return number_of_pages
else:
return max_pages
return max_pages


class IndeedScraperGEGer(BaseIndeedScraper, BaseGEGerScraper):
    """Scrapes jobs from indeed.de
    """

    # NOTE(review): the project owner commented on this change: "I think you
    # can remove _get_search_url and _get_num_search_result_pages here as
    # these are defined by BaseIndeedScraper". The base class is not visible
    # from here -- confirm before removing. The page-count override below is
    # kept because it additionally handles German-formatted result counts.
    def _get_search_url(self, method: Optional[str] = 'get') -> str:
        """Get the indeed search url from SearchTerms
        TODO: use Enum for method instead of str.

        Args:
            method: the html method to build the url for; only 'get' is
                implemented.
        Returns:
            The search url for the configured domain, query, city, radius,
            page size, similar-results filter and remoteness.
        Raises:
            NotImplementedError: if method is 'post'.
            ValueError: if method is neither 'get' nor 'post'.
        """
        if method == 'get':
            return (
                "https://www.indeed.{}/jobs?q={}&l={}&radius={}&"
                "limit={}&filter={}{}".format(
                    self.config.search_config.domain,
                    self.query,
                    self.config.search_config.city.replace(' ', '+'),
                    self._quantize_radius(self.config.search_config.radius),
                    self.max_results_per_page,
                    int(self.config.search_config.return_similar_results),
                    REMOTENESS_TO_QUERY[self.config.search_config.remoteness],
                )
            )
        elif method == 'post':
            raise NotImplementedError()
        else:
            raise ValueError(f'No html method {method} exists')

    def _get_num_search_result_pages(self, search_url: str,
                                     max_pages=0) -> int:
        """Calculates the number of pages of job listings to be scraped.

        i.e. your search yields 230 results at 50 res/page -> 5 pages of jobs

        Args:
            search_url: the url of the first page of search results.
            max_pages: the maximum number of pages to be scraped, 0 means
                no limit.
        Returns:
            The number of pages to be scraped.
        Raises:
            ValueError: if the result count element is missing from the page
                or no result count can be parsed from its text.
        """
        # Get the html data, initialize bs4 with the configured parser
        request_html = self.session.get(search_url)
        self.logger.debug(
            "Got Base search results page: %s", search_url
        )
        query_resp = BeautifulSoup(request_html.text, self.config.bs4_parser)
        num_res = query_resp.find(id='searchCountPages')
        # TODO: we should consider expanding the error cases (scrape error page)
        if not num_res:
            raise ValueError(
                "Unable to identify number of pages of results for query: {}"
                " Please ensure linked page contains results, you may have"
                " provided a city for which there are no results within this"
                " province or state.".format(search_url)
            )

        count_text = normalize("NFKD", num_res.contents[0].strip())
        # Drop thousands separators before extracting the numbers: ',' as
        # before, plus '.' between digits, which is the German convention
        # (e.g. '1.234'); otherwise only the trailing '234' would be read.
        count_text = re.sub(
            r'(?<=\d)\.(?=\d)', '', count_text.replace(',', '')
        )
        # The second number in the text is taken as the total result count
        # (the first is presumably the current page number).
        numbers = re.findall(r'(\d+) ', count_text)
        if len(numbers) < 2:
            raise ValueError(
                "Unable to parse number of results from {!r} for query: {}"
                .format(count_text, search_url)
            )

        number_of_pages = int(
            ceil(int(numbers[1]) / self.max_results_per_page)
        )
        if max_pages == 0:
            return number_of_pages
        return min(number_of_pages, max_pages)
31 changes: 30 additions & 1 deletion jobfunnel/backend/scrapers/monster.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from jobfunnel.backend import Job
from jobfunnel.backend.scrapers.base import (BaseCANEngScraper, BaseScraper,
BaseUSAEngScraper, BaseUKEngScraper,
BaseFRFreScraper)
BaseFRFreScraper, BaseGEGerScraper)
from jobfunnel.backend.tools.filters import JobFilter
from jobfunnel.backend.tools.tools import calc_post_date_from_relative_str
from jobfunnel.resources import JobField, Remoteness
Expand Down Expand Up @@ -406,3 +406,32 @@ def _get_search_url(self, method: Optional[str] = 'get',
raise NotImplementedError()
else:
raise ValueError(f'No html method {method} exists')

class MonsterScraperGEGer(MonsterMetricRadius, BaseMonsterScraper,
                          BaseGEGerScraper):
    """Scrapes jobs from www.monster.de
    """

    def _get_search_url(self, method: Optional[str] = 'get',
                        page: int = 1) -> str:
        """Build the monster search url from the search configuration.

        TODO: implement fulltime/part-time portion + company search?
        TODO: implement POST
        NOTE: unfortunately we cannot start on any page other than 1,
              so the jobs displayed just scrolls forever and we will see
              all previous jobs as we go.

        Args:
            method: the html method to build the url for; only 'get' is
                implemented.
            page: the results page; a page parameter is only added to the
                url when this is greater than 1.
        Returns:
            The search url for the configured domain, query, city and radius.
        Raises:
            NotImplementedError: if method is 'post'.
            ValueError: if method is neither 'get' nor 'post'.
        """
        # Reject everything but GET up front.
        if method == 'post':
            raise NotImplementedError()
        if method != 'get':
            raise ValueError(f'No html method {method} exists')

        search_config = self.config.search_config
        domain = search_config.domain
        page_param = f'page={page}&' if page > 1 else ''
        query = self.query
        city = search_config.city.replace(' ', '-')
        # NOTE(review): the project owner asked on this change: "Does
        # German's Monster allow searching by state? If so I think perhaps
        # this is missing here." -- no state is included in the url below.
        radius = self._convert_radius(search_config.radius)
        return (
            'https://www.monster.{}/jobs/search/?{}q={}&where={}'
            '&rad={}'.format(domain, page_param, query, city, radius)
        )
8 changes: 6 additions & 2 deletions jobfunnel/backend/scrapers/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@

from jobfunnel.backend.scrapers.indeed import (
IndeedScraperCANEng, IndeedScraperUSAEng,
IndeedScraperUKEng, IndeedScraperFRFre
IndeedScraperUKEng, IndeedScraperFRFre,
IndeedScraperGEGer
)
from jobfunnel.backend.scrapers.monster import (
MonsterScraperCANEng, MonsterScraperUSAEng,
MonsterScraperUKEng, MonsterScraperFRFre
MonsterScraperUKEng, MonsterScraperFRFre,
MonsterScraperGEGer
)
from jobfunnel.backend.scrapers.glassdoor import (
GlassDoorScraperCANEng, GlassDoorScraperUSAEng,
Expand All @@ -25,6 +27,7 @@
Locale.USA_ENGLISH: IndeedScraperUSAEng,
Locale.UK_ENGLISH: IndeedScraperUKEng,
Locale.FRANCE_FRENCH: IndeedScraperFRFre,
Locale.GERMANY_GERMAN: IndeedScraperGEGer
},
Provider.GLASSDOOR: {
Locale.CANADA_ENGLISH: GlassDoorScraperCANEng,
Expand All @@ -36,5 +39,6 @@
Locale.USA_ENGLISH: MonsterScraperUSAEng,
Locale.UK_ENGLISH: MonsterScraperUKEng,
Locale.FRANCE_FRENCH: MonsterScraperFRFre,
Locale.GERMANY_GERMAN:MonsterScraperGEGer
},
}
8 changes: 4 additions & 4 deletions jobfunnel/backend/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
from jobfunnel.backend import Job

# Initialize list and store regex objects of date quantifiers
HOUR_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?(?:hour|hr|heure)')
DAY_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?(?:day|d|jour)')
MONTH_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?month|mois')
YEAR_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?year|annee')
# Date quantifier patterns (English, French and German unit words). Group 1
# captures the number; the unit words sit in a non-capturing group so that
# the alternation cannot match a bare unit word with no number in front.
# NOTE(review): matching is case-sensitive -- confirm whether callers
# lowercase the input, in which case the capitalised German words never match.
HOUR_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?(?:hour|hr|heure|Stunde)')
DAY_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?(?:day|d|jour|Tag)')
MONTH_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?(?:month|mois|Monat)')
YEAR_REGEX = re.compile(r'(\d+)(?:[ +]{1,3})?(?:year|annee|Jahr)')
RECENT_REGEX_A = re.compile(r'[tT]oday|[jJ]ust [pP]osted')
RECENT_REGEX_B = re.compile(r'[yY]esterday')

Expand Down
1 change: 1 addition & 0 deletions jobfunnel/resources/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@
Locale.USA_ENGLISH: 'com',
Locale.UK_ENGLISH: 'co.uk',
Locale.FRANCE_FRENCH: 'fr',
Locale.GERMANY_GERMAN: 'de',
}
2 changes: 1 addition & 1 deletion jobfunnel/resources/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class Locale(Enum):
USA_ENGLISH = 3
UK_ENGLISH = 4
FRANCE_FRENCH = 5

GERMANY_GERMAN = 6

class JobStatus(Enum):
"""Job statuses that are built-into jobfunnel
Expand Down
1 change: 1 addition & 0 deletions tests/config/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def test_search_config_query_string(mocker, keywords, exp_query_str):
(Locale.USA_ENGLISH, None, 'com'),
(Locale.UK_ENGLISH, None, 'co.uk'),
(Locale.FRANCE_FRENCH, None, 'fr'),
(Locale.GERMANY_GERMAN,None, 'de'),
(Locale.USA_ENGLISH, 'xyz', 'xyz'),
(None, None, None),
])
Expand Down