From 99e197ca9737b13471d30893be349774b0617814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jannes=20H=C3=B6ke?= Date: Tue, 22 Oct 2024 16:25:42 +0200 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Add=20scraper=20for=20SPNV-?= =?UTF-8?q?=E2=80=8B=E2=80=8BQualit=C3=A4tsmonitor=20NRW=20disabled=20(for?= =?UTF-8?q?=20now)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../.copier-answers.yml | 12 ++ .../spnv_qualitaetsmonitor_nrw/README.md | 5 + .../spnv_qualitaetsmonitor_nrw/__init__.py | 0 .../spnv_qualitaetsmonitor_nrw/models.py | 146 ++++++++++++++++++ .../spnv_qualitaetsmonitor_nrw.py | 100 ++++++++++++ scrapers_config.json | 21 +++ 6 files changed, 284 insertions(+) create mode 100644 ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/.copier-answers.yml create mode 100644 ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/README.md create mode 100644 ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/__init__.py create mode 100644 ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/models.py create mode 100644 ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/spnv_qualitaetsmonitor_nrw.py diff --git a/ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/.copier-answers.yml b/ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/.copier-answers.yml new file mode 100644 index 0000000..d0ba4ba --- /dev/null +++ b/ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/.copier-answers.yml @@ -0,0 +1,12 @@ +# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY +_src_path: /home/jhoeke/git/ddj/wdr-ddj-cloud/scraper_template +contact_email: mail@jhoeke.de +contact_name: Jannes Höke +description: Scraping the SPNV-​​Qualitätsmonitor NRW website at https://infoportal.mobil.nrw/information-service/qualitaetsmonitor-nrw.html +display_name: SPNV-​​Qualitätsmonitor NRW +ephemeral_storage: '512' +interval: daily +memory_size: '1024' +module_name: spnv_qualitaetsmonitor_nrw +preset: minimal + diff --git a/ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/README.md 
class Filters:
    """Namespace for the models that validate the ``QmFilterShow`` filter list.

    The payload is a JSON list with one entry per filter; each entry names
    its filter in ``target`` and lists the selectable values in ``items``.
    """

    class ItemBase(BaseModel, abc.ABC):
        """Fields shared by every selectable filter value."""

        # Pydantic v2 spelling of the former nested ``class Config`` (which
        # still works in v2 but emits a deprecation warning).
        model_config = {"arbitrary_types_allowed": True}

        status: str
        selected: bool
        # Only an empty list validates: no element type is possible and the
        # maximum length is zero.
        children: Annotated[list[Never], Len(max_length=0)]

    class ItemInt(ItemBase):
        """Filter value whose title is an integer."""

        title: int

    class ItemString(ItemBase):
        """Filter value whose title is free text."""

        title: str

    class TargetBase(BaseModel, abc.ABC):
        """Common base of all filter targets; declares no fields itself."""

    class TargetYear(TargetBase):
        target: Literal["year"]
        items: list[Filters.ItemInt]

    class TargetQuarter(TargetBase):
        target: Literal["quarter"]
        items: list[Filters.ItemInt]

    class TargetRegion(TargetBase):
        target: Literal["region"]
        items: list[Filters.ItemString]

    class TargetEvu(TargetBase):
        target: Literal["evu"]
        items: list[Filters.ItemString]

    class TargetProductType(TargetBase):
        target: Literal["product_type"]
        items: list[Filters.ItemString]

    class TargetLines(TargetBase):
        target: Literal["lines"]
        items: list[Filters.ItemString]

    class TargetComplexity(TargetBase):
        target: Literal["complexity"]
        items: list[Filters.ItemInt]

    # Union of every concrete target; ``Data`` discriminates on ``target``.
    Target: TypeAlias = (
        TargetYear
        | TargetQuarter
        | TargetRegion
        | TargetEvu
        | TargetProductType
        | TargetLines
        | TargetComplexity
    )

    class Data(BaseModel):
        """All filter targets of one response, in the order they were sent."""

        targets: list[
            Annotated[
                Filters.Target,
                Field(discriminator="target"),
            ]
        ]

    @staticmethod
    def from_json(json: list[dict[str, Any]]) -> Filters.Data:
        """Validate the decoded filter list and wrap it in ``Filters.Data``.

        Args:
            json: The decoded JSON list, one dict per filter target.

        Returns:
            The validated filter description.

        Raises:
            pydantic.ValidationError: If an entry matches none of the known
                targets or fails field validation.
        """
        return Filters.Data.model_validate({"targets": json})


class Results:
    """Namespace for the models that validate one result record."""

    class ColumnBase(BaseModel, abc.ABC):
        """A per-year metric; subclasses add one ``quarters`` value per quarter."""

        year: int

    class ColumnOverallRanking(ColumnBase):
        quarters: list[float]

    class ColumnComplexity(ColumnBase):
        quarters: list[int]

    class ColumnPunctuality(ColumnBase):
        quarters: list[float]

    class ColumnReliability(ColumnBase):
        quarters: list[float]

    class ColumnTrainFormation(ColumnBase):
        quarters: list[float]

    class ColumnPassengers(ColumnBase):
        quarters: list[int]

    # Union of every quarterly metric type, used for typing generic access.
    Column: TypeAlias = (
        ColumnOverallRanking
        | ColumnComplexity
        | ColumnPunctuality
        | ColumnReliability
        | ColumnTrainFormation
        | ColumnPassengers
    )

    class Data(BaseModel):
        """One result record: descriptive fields plus six quarterly metrics."""

        # Descriptive fields are read from underscore-prefixed keys.
        id: int = Field(alias="_id")
        evu: str = Field(alias="_evu")
        evutooltip: str = Field(alias="_evutooltip")
        producttype: str = Field(alias="_producttype")
        client: str = Field(alias="_client")
        fullname: str = Field(alias="_fullname")
        subnet: str = Field(alias="_subnet")
        runtime: str = Field(alias="_runtime")
        line_stations: str = Field(alias="_line_stations")
        line: str
        ranking: int
        # Quarterly metrics, each a nested object with ``year`` and ``quarters``.
        overall_ranking: Results.ColumnOverallRanking
        complexity: Results.ColumnComplexity
        punctuality: Results.ColumnPunctuality
        reliability: Results.ColumnReliability
        train_formation: Results.ColumnTrainFormation
        passengers: Results.ColumnPassengers
BASE_URL = "https://infoportal.mobil.nrw"

# Seconds to wait for the portal. ``requests`` waits forever when no timeout
# is given, which would let a stalled connection block the whole job.
REQUEST_TIMEOUT = 60

# Attributes of ``Results.Data`` that carry one value per quarter.
QUARTERLY_COLUMNS = (
    "overall_ranking",
    "complexity",
    "punctuality",
    "reliability",
    "train_formation",
    "passengers",
)

QUARTERS_PER_YEAR = 4


def _load_filters() -> "Filters.Data":
    """Fetch and validate the list of available filters.

    Returns:
        The validated filter description.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
        AssertionError: If the decoded payload is not a JSON list.
    """
    response = requests.get(f"{BASE_URL}/QmFilterShow.html", timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    payload = response.json()
    # Raised explicitly (same type and message as the former ``assert``) so
    # the check still runs under ``python -O``.
    if not isinstance(payload, list):
        raise AssertionError("Unexpected response type for QmFilterShow")

    return Filters.from_json(payload)


def _list_param_raw(target: str, items: Iterable[str]) -> tuple[str, str]:
    """Build one form field selecting ``items`` for the filter ``target``.

    Returns:
        A ``(field name, comma-separated values)`` pair.
    """
    return f"tx_cpqualitymonitor_ajaxlistfilter[filter][{target}]", ",".join(items)


def _list_param(target: "Filters.Target") -> tuple[str, str]:
    """Build the form field that selects every item of ``target``."""
    return _list_param_raw(target.target, [str(item.title) for item in target.items])


def _load_year(targets: "list[Filters.Target]", year: int) -> Iterable["Results.Data"]:
    """Yield the validated result records for a single year.

    This is a generator: the request is only sent once iteration starts.

    Args:
        targets: Filters to send with all of their items selected. A target
            named ``"year"`` would replace the ``year`` argument, so callers
            must leave it out.
        year: The year to request.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
        AssertionError: If the payload is not a JSON object or has no
            ``"data"`` key.
    """
    url = f"{BASE_URL}/QmAjaxListFilter.html"
    post_data = dict(
        [
            _list_param_raw("year", [str(year)]),
            *(_list_param(target) for target in targets),
        ]
    )

    response = requests.post(url, data=post_data, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()

    payload = response.json()
    if not isinstance(payload, dict):
        raise AssertionError("Unexpected response type for QmAjaxListFilter")
    if "data" not in payload:
        raise AssertionError("No data in response")

    for result in payload["data"]:
        yield Results.Data.model_validate(result)


def _to_quarter_rows(data: "Results.Data", year: int) -> Iterable[dict]:
    """Flatten one result record into one row per quarter that has data.

    Each row holds ``year``, ``quarter``, every field of ``data.model_dump()``
    and, replacing the nested metric objects, that quarter's scalar value for
    each name in ``QUARTERLY_COLUMNS``.

    Quarters whose ``overall_ranking`` is ``0`` are skipped. If a metric
    lists fewer than four values, only the quarters present in every metric
    are emitted.
    """
    # Identical for every quarter of this record, so dump it only once.
    shared_fields = data.model_dump()
    series = [
        cast("Results.Column", getattr(data, column)).quarters
        for column in QUARTERLY_COLUMNS
    ]

    # zip() stops at the shortest input, so short lists cannot raise IndexError.
    # NOTE(review): ``quarter`` is zero-based (0-3) in the output; confirm
    # whether consumers of the CSV expect 1-4.
    for quarter, *values in zip(range(QUARTERS_PER_YEAR), *series):
        quarterly_data = dict(zip(QUARTERLY_COLUMNS, values))

        # Skip quarters where data is unavailable
        if quarterly_data["overall_ranking"] == 0:
            continue

        yield {"year": year, "quarter": quarter, **shared_fields} | quarterly_data


def run() -> None:
    """Scrape every available year and upload the flattened rows as CSV.

    Raises:
        AssertionError: If the filter list offers no years.
    """
    filters_data = _load_filters()

    targets_without_year: list[Filters.Target] = []
    years_available: list[int] = []

    for target in filters_data.targets:
        if target.target == "year":
            years_available = [item.title for item in target.items]
        else:
            targets_without_year.append(target)

    if not years_available:
        raise AssertionError("No years available")

    rows: list[dict] = []
    for year in years_available:
        for result in _load_year(targets_without_year, year):
            rows.extend(_to_quarter_rows(result, year))

    df = pd.DataFrame(rows)
    upload_dataframe(df, "spnv_qualitaetsmonitor_nrw/data.csv")
b/scrapers_config.json index 63bf6a3..5d021c2 100644 --- a/scrapers_config.json +++ b/scrapers_config.json @@ -140,5 +140,26 @@ "extra_env": [ "TALSPERREN_DATAWRAPPER_TOKEN" ] + }, + { + "display_name": "SPNV-​​Qualitätsmonitor NRW", + "module_name": "spnv_qualitaetsmonitor_nrw", + "description": "Scraping the SPNV-​​Qualitätsmonitor NRW website at https://infoportal.mobil.nrw/information-service/qualitaetsmonitor-nrw.html", + "contact_name": "Jannes Höke", + "contact_email": "mail@jhoeke.de", + "memory_size": "1024", + "ephemeral_storage": "512", + "preset": "minimal", + "events": [ + { + "type": "schedule", + "enabled": false, + "data": { + "interval": "daily", + "interval_custom": null + } + } + ], + "extra_env": [] } ]