Skip to content

Commit

Permalink
Add company industry and job level to linkedin scraper (#166)
Browse files Browse the repository at this point in the history
  • Loading branch information
alibakhshiilani authored Jul 16, 2024
1 parent edffe18 commit 48631ea
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 3 deletions.
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs = scrape_jobs(
hours_old=72, # (only Linkedin/Indeed are hour specific; others round up to days old)
country_indeed='USA', # only needed for indeed / glassdoor

# linkedin_fetch_description=True # get full description and direct job url for linkedin (slower)
# linkedin_fetch_description=True # get full description, direct job url, company industry and job level (seniority level) for linkedin (slower)
# proxies=["208.195.175.46:65095", "208.195.175.45:65095", "localhost"],

)
Expand Down Expand Up @@ -150,10 +150,15 @@ JobPost
├── emails (str)
└── is_remote (bool)
Linkedin specific
└── job_level (str)
Linkedin & Indeed specific
└── company_industry (str)
Indeed specific
├── company_country (str)
└── company_addresses (str)
└── company_industry (str)
└── company_employees_label (str)
└── company_revenue_label (str)
└── company_description (str)
Expand Down
1 change: 1 addition & 0 deletions src/jobspy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ def convert_to_annual(job_data: dict):
"max_amount",
"currency",
"is_remote",
"job_level",
"job_function",
"listing_type",
"emails",
Expand Down
7 changes: 6 additions & 1 deletion src/jobspy/jobs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,9 +244,14 @@ class JobPost(BaseModel):
is_remote: bool | None = None
listing_type: str | None = None

# linkedin specific
job_level: str | None = None

# linkedin and indeed specific
company_industry: str | None = None

# indeed specific
company_addresses: str | None = None
company_industry: str | None = None
company_num_employees: str | None = None
company_revenue: str | None = None
company_description: str | None = None
Expand Down
50 changes: 50 additions & 0 deletions src/jobspy/scrapers/linkedin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,8 @@ def _process_job(
job_url=f"{self.base_url}/jobs/view/{job_id}",
compensation=compensation,
job_type=job_details.get("job_type"),
job_level=job_details.get("job_level"),
company_industry=job_details.get("company_industry"),
description=job_details.get("description"),
job_url_direct=job_details.get("job_url_direct"),
emails=extract_emails_from_text(job_details.get("description")),
Expand Down Expand Up @@ -266,6 +268,8 @@ def _get_job_details(self, job_id: str) -> dict:
job_function = job_function_span.text.strip()
return {
"description": description,
"job_level": self._parse_job_level(soup),
"company_industry": self._parse_company_industry(soup),
"job_type": self._parse_job_type(soup),
"job_url_direct": self._parse_job_url_direct(soup),
"logo_photo_url": soup.find("img", {"class": "artdeco-entity-image"}).get(
Expand Down Expand Up @@ -325,6 +329,52 @@ def _parse_job_type(soup_job_type: BeautifulSoup) -> list[JobType] | None:

return [get_enum_from_job_type(employment_type)] if employment_type else []

@staticmethod
def _parse_job_level(soup_job_level: BeautifulSoup) -> str | None:
    """
    Gets the job level (the "Seniority level" criterion) from job page
    :param soup_job_level: soup of the job page
    :return: the seniority text with surrounding whitespace stripped, or
        None when the header or its value span is not found
    """
    h3_tag = soup_job_level.find(
        "h3",
        class_="description__job-criteria-subheader",
        # The filter may be called with None (a tag's ``.string`` is None
        # when the header wraps nested markup or has no text). Guard before
        # using ``in`` so such a header is skipped instead of raising
        # TypeError.
        string=lambda text: bool(text) and "Seniority level" in text,
    )
    if not h3_tag:
        return None

    # The value is looked up as a sibling <span> following the matched header.
    job_level_span = h3_tag.find_next_sibling(
        "span",
        class_="description__job-criteria-text description__job-criteria-text--criteria",
    )
    return job_level_span.get_text(strip=True) if job_level_span else None

@staticmethod
def _parse_company_industry(soup_industry: BeautifulSoup) -> str | None:
    """
    Gets the company industry (the "Industries" criterion) from job page
    :param soup_industry: soup of the job page
    :return: the industry text with surrounding whitespace stripped, or
        None when the header or its value span is not found
    """
    h3_tag = soup_industry.find(
        "h3",
        class_="description__job-criteria-subheader",
        # The filter may be called with None (a tag's ``.string`` is None
        # when the header wraps nested markup or has no text). Guard before
        # using ``in`` so such a header is skipped instead of raising
        # TypeError.
        string=lambda text: bool(text) and "Industries" in text,
    )
    if not h3_tag:
        return None

    # The value is looked up as a sibling <span> following the matched header.
    industry_span = h3_tag.find_next_sibling(
        "span",
        class_="description__job-criteria-text description__job-criteria-text--criteria",
    )
    return industry_span.get_text(strip=True) if industry_span else None

def _parse_job_url_direct(self, soup: BeautifulSoup) -> str | None:
"""
Gets the job url direct from job page
Expand Down

0 comments on commit 48631ea

Please sign in to comment.