-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ Add scraper for SPNV-Qualitätsmonitor NRW
disabled (for now)
- Loading branch information
Showing
6 changed files
with
284 additions
and
0 deletions.
There are no files selected for viewing
12 changes: 12 additions & 0 deletions
12
ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/.copier-answers.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY | ||
_src_path: /home/jhoeke/git/ddj/wdr-ddj-cloud/scraper_template | ||
contact_email: [email protected] | ||
contact_name: Jannes Höke | ||
description: Scraping the SPNV-Qualitätsmonitor NRW website at https://infoportal.mobil.nrw/information-service/qualitaetsmonitor-nrw.html | ||
display_name: SPNV-Qualitätsmonitor NRW | ||
ephemeral_storage: '512' | ||
interval: daily | ||
memory_size: '1024' | ||
module_name: spnv_qualitaetsmonitor_nrw | ||
preset: minimal | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# SPNV-Qualitätsmonitor NRW | ||
|
||
**Contact:** Jannes Höke ([email protected]) | ||
|
||
Scraping the SPNV-Qualitätsmonitor NRW website at https://infoportal.mobil.nrw/information-service/qualitaetsmonitor-nrw.html |
Empty file.
146 changes: 146 additions & 0 deletions
146
ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/models.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
from __future__ import annotations | ||
|
||
import abc | ||
from typing import Annotated, Any, Literal, Never, TypeAlias | ||
|
||
from annotated_types import Len # noqa: TCH002 | ||
from pydantic import BaseModel, Field | ||
|
||
|
||
class Filters:
    """Namespace for the models of the website's filter definitions.

    The models describe a JSON list of "targets" (year, quarter, region, ...),
    each carrying the items that can be selected for it.
    ``Filters.Data.from_json`` is the entry point for validating such a list.
    """

    class ItemBase(BaseModel, abc.ABC):
        """Fields shared by every selectable filter item."""

        # Pydantic v2 configuration. The v1-style nested ``class Config`` is
        # deprecated in pydantic v2 and emits a deprecation warning; the
        # setting itself is unchanged.
        model_config = {"arbitrary_types_allowed": True}

        status: str
        selected: bool
        # Only an empty list validates here (max_length=0).
        children: Annotated[list[Never], Len(max_length=0)]

    class ItemInt(ItemBase):
        """Filter item whose title is an integer."""

        title: int

    class ItemString(ItemBase):
        """Filter item whose title is a string."""

        title: str

    class TargetBase(BaseModel, abc.ABC):
        """Common base of all filter targets; declares no fields itself."""

    class TargetYear(TargetBase):
        target: Literal["year"]
        items: list[Filters.ItemInt]

    class TargetQuarter(TargetBase):
        target: Literal["quarter"]
        items: list[Filters.ItemInt]

    class TargetRegion(TargetBase):
        target: Literal["region"]
        items: list[Filters.ItemString]

    class TargetEvu(TargetBase):
        target: Literal["evu"]
        items: list[Filters.ItemString]

    class TargetProductType(TargetBase):
        target: Literal["product_type"]
        items: list[Filters.ItemString]

    class TargetLines(TargetBase):
        target: Literal["lines"]
        items: list[Filters.ItemString]

    class TargetComplexity(TargetBase):
        target: Literal["complexity"]
        items: list[Filters.ItemInt]

    # Union of all concrete targets; the ``target`` literal tells them apart.
    Target: TypeAlias = (
        TargetYear
        | TargetQuarter
        | TargetRegion
        | TargetEvu
        | TargetProductType
        | TargetLines
        | TargetComplexity
    )

    class Data(BaseModel):
        """Container holding every filter target of one response."""

        # Each entry is validated against the member of ``Filters.Target``
        # selected by the value of its ``target`` field.
        targets: list[
            Annotated[
                Filters.Target,
                Field(discriminator="target"),
            ]
        ]

        @classmethod
        def from_json(cls, json: list[dict[str, Any]]) -> Filters.Data:
            """Validate a decoded JSON list of targets.

            Args:
                json: List of target dictionaries, one per filter target.

            Returns:
                The validated model with the list stored under ``targets``.

            Raises:
                pydantic.ValidationError: If an entry matches no known target.
            """
            return cls.model_validate({"targets": json})
|
||
|
||
class Results:
    """Namespace for the models of one result entry.

    The scraper validates every element of the ``data`` list returned by
    ``QmAjaxListFilter.html`` against ``Results.Data``.
    """

    class ColumnBase(BaseModel, abc.ABC):
        """Common base of all metric columns: every column carries a year."""

        year: int

    # Each column holds a ``quarters`` list. The scraper indexes it with the
    # positions 0-3 and treats an overall-ranking value of 0 as "no data".

    class ColumnOverallRanking(ColumnBase):
        quarters: list[float]

    class ColumnComplexity(ColumnBase):
        quarters: list[int]

    class ColumnPunctuality(ColumnBase):
        quarters: list[float]

    class ColumnReliability(ColumnBase):
        quarters: list[float]

    class ColumnTrainFormation(ColumnBase):
        quarters: list[float]

    class ColumnPassengers(ColumnBase):
        quarters: list[int]

    # Union of all concrete column models.
    Column: TypeAlias = (
        ColumnOverallRanking
        | ColumnComplexity
        | ColumnPunctuality
        | ColumnReliability
        | ColumnTrainFormation
        | ColumnPassengers
    )

    class Data(BaseModel):
        """One result entry: descriptive fields plus one column model per metric.

        NOTE: the field declaration order is also the key order produced by
        ``model_dump()``, which the scraper spreads into its output rows.
        """

        # Fields with an alias are populated from the underscore-prefixed key
        # of the input dictionary.
        id: int = Field(alias="_id")
        evu: str = Field(alias="_evu")
        evutooltip: str = Field(alias="_evutooltip")
        producttype: str = Field(alias="_producttype")
        client: str = Field(alias="_client")
        fullname: str = Field(alias="_fullname")
        subnet: str = Field(alias="_subnet")
        runtime: str = Field(alias="_runtime")
        line_stations: str = Field(alias="_line_stations")
        # Fields without an alias are read from the key of the same name.
        line: str
        ranking: int
        # Per-quarter metric columns.
        overall_ranking: Results.ColumnOverallRanking
        complexity: Results.ColumnComplexity
        punctuality: Results.ColumnPunctuality
        reliability: Results.ColumnReliability
        train_formation: Results.ColumnTrainFormation
        passengers: Results.ColumnPassengers
100 changes: 100 additions & 0 deletions
100
ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/spnv_qualitaetsmonitor_nrw.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
from collections.abc import Iterable | ||
from typing import cast | ||
|
||
import pandas as pd | ||
import requests | ||
|
||
from ddj_cloud.utils.storage import upload_dataframe | ||
|
||
from .models import Filters, Results | ||
|
||
BASE_URL = "https://infoportal.mobil.nrw" | ||
|
||
|
||
def _load_filters(timeout: float = 60.0) -> Filters.Data:
    """Fetch the available filter targets from the website and validate them.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        The validated filter data; its ``targets`` list holds one entry per
        filter target.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        TypeError: If the decoded JSON body is not a list.
    """
    response = requests.get(f"{BASE_URL}/QmFilterShow.html", timeout=timeout)
    response.raise_for_status()

    response_json = response.json()
    # Raise explicitly instead of asserting: asserts are stripped under ``-O``.
    if not isinstance(response_json, list):
        msg = "Unexpected response type for QmFilterShow"
        raise TypeError(msg)

    # ``from_json`` is defined on ``Filters.Data``, not on the ``Filters``
    # namespace class itself.
    return Filters.Data.from_json(response_json)
|
||
|
||
def _list_param_raw(target: str, items: Iterable[str]): | ||
return f"tx_cpqualitymonitor_ajaxlistfilter[filter][{target}]", ",".join(items) | ||
|
||
|
||
def _list_param(target: Filters.Target):
    """Build the form parameter that selects every item of a filter target.

    Args:
        target: Filter target whose item titles are all sent as the selection.

    Returns:
        The ``(name, value)`` pair produced by ``_list_param_raw``.
    """
    target_name = target.target
    # Titles may be ints or strings depending on the target; send them as text.
    titles = (str(item.title) for item in target.items)
    return _list_param_raw(target_name, titles)
|
||
|
||
def _load_year(
    targets: list[Filters.Target],
    year: int,
    timeout: float = 60.0,
) -> Iterable[Results.Data]:
    """Request the results of one year and yield them as validated models.

    Args:
        targets: Filter targets to send along; every item of each target is
            selected.
        year: The single year to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Yields:
        One validated ``Results.Data`` per element of the response's ``data``
        list.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        TypeError: If the decoded JSON body is not a dict.
        ValueError: If the response has no ``data`` key.
    """
    url = f"{BASE_URL}/QmAjaxListFilter.html"
    # The year parameter comes first, followed by one parameter per target.
    post_data = dict(
        [
            _list_param_raw("year", [str(year)]),
            *(_list_param(target) for target in targets),
        ]
    )

    response = requests.post(url, data=post_data, timeout=timeout)
    response.raise_for_status()

    response_json = response.json()
    # Raise explicitly instead of asserting: asserts are stripped under ``-O``.
    if not isinstance(response_json, dict):
        msg = "Unexpected response type for QmAjaxListFilter"
        raise TypeError(msg)
    if "data" not in response_json:
        msg = "No data in response"
        raise ValueError(msg)

    for result in response_json["data"]:
        yield Results.Data.model_validate(result)
|
||
|
||
def _to_quarter_rows(data: Results.Data, year: int):
    """Flatten one result entry into one row per quarter that has data.

    Args:
        data: Validated result entry whose metric columns hold per-quarter
            lists.
        year: Year the entry was requested for; written to the ``year`` key.

    Yields:
        One dict per available quarter. It contains ``year``, ``quarter``,
        every dumped model field and, replacing the dumped column objects,
        the single value of each quarterly column for that quarter.
    """
    quarterly_columns = (
        "overall_ranking",
        "complexity",
        "punctuality",
        "reliability",
        "train_formation",
        "passengers",
    )
    # The dump does not depend on the quarter, so compute it once instead of
    # once per yielded row. Its nested column values are always overwritten
    # below, so rows never share mutable state.
    dumped = data.model_dump()

    for quarter in range(4):
        # Skip quarters where data is unavailable
        if data.overall_ranking.quarters[quarter] == 0:
            continue

        # NOTE(review): ``quarter`` is the zero-based list index (0-3), not
        # 1-4 -- confirm this is what consumers of the CSV expect.
        base_data = {
            "year": year,
            "quarter": quarter,
            **dumped,
        }
        quarterly_data = {
            column: cast(Results.Column, getattr(data, column)).quarters[quarter]
            for column in quarterly_columns
        }

        yield base_data | quarterly_data
|
||
|
||
def run():
    """Scrape all available years and upload the combined quarterly rows as CSV."""
    filters_data = _load_filters()

    # Split the filter targets: the year target supplies the years to iterate
    # over, every other target is passed along with each yearly request.
    years_available: list[int] = []
    targets_without_year: list[Filters.Target] = []
    for target in filters_data.targets:
        if target.target == "year":
            years_available = [item.title for item in target.items]
        else:
            targets_without_year.append(target)

    assert len(years_available) > 0, "No years available"

    # One row per result entry and available quarter, in request order.
    rows: list[dict] = [
        row
        for year in years_available
        for result in _load_year(targets_without_year, year)
        for row in _to_quarter_rows(result, year)
    ]

    upload_dataframe(pd.DataFrame(rows), "spnv_qualitaetsmonitor_nrw/data.csv")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -140,5 +140,26 @@ | |
"extra_env": [ | ||
"TALSPERREN_DATAWRAPPER_TOKEN" | ||
] | ||
}, | ||
{ | ||
"display_name": "SPNV-Qualitätsmonitor NRW", | ||
"module_name": "spnv_qualitaetsmonitor_nrw", | ||
"description": "Scraping the SPNV-Qualitätsmonitor NRW website at https://infoportal.mobil.nrw/information-service/qualitaetsmonitor-nrw.html", | ||
"contact_name": "Jannes Höke", | ||
"contact_email": "[email protected]", | ||
"memory_size": "1024", | ||
"ephemeral_storage": "512", | ||
"preset": "minimal", | ||
"events": [ | ||
{ | ||
"type": "schedule", | ||
"enabled": false, | ||
"data": { | ||
"interval": "daily", | ||
"interval_custom": null | ||
} | ||
} | ||
], | ||
"extra_env": [] | ||
} | ||
] |