Skip to content

Commit

Permalink
✨ Add scraper for SPNV-​​Qualitätsmonitor NRW
Browse files Browse the repository at this point in the history
disabled (for now)
  • Loading branch information
jh0ker committed Oct 22, 2024
1 parent d1c617b commit 99e197c
Show file tree
Hide file tree
Showing 6 changed files with 284 additions and 0 deletions.
12 changes: 12 additions & 0 deletions ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/.copier-answers.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Changes here will be overwritten by Copier; NEVER EDIT MANUALLY
_src_path: /home/jhoeke/git/ddj/wdr-ddj-cloud/scraper_template
contact_email: [email protected]
contact_name: Jannes Höke
description: Scraping the SPNV-​​Qualitätsmonitor NRW website at https://infoportal.mobil.nrw/information-service/qualitaetsmonitor-nrw.html
display_name: SPNV-​​Qualitätsmonitor NRW
ephemeral_storage: '512'
interval: daily
memory_size: '1024'
module_name: spnv_qualitaetsmonitor_nrw
preset: minimal

5 changes: 5 additions & 0 deletions ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# SPNV-Qualitätsmonitor NRW

**Contact:** Jannes Höke ([email protected])

Scraping the SPNV-Qualitätsmonitor NRW website at https://infoportal.mobil.nrw/information-service/qualitaetsmonitor-nrw.html
Empty file.
146 changes: 146 additions & 0 deletions ddj_cloud/scrapers/spnv_qualitaetsmonitor_nrw/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
from __future__ import annotations

import abc
from typing import Annotated, Any, Literal, Never, TypeAlias

from annotated_types import Len # noqa: TCH002
from pydantic import BaseModel, Field


class Filters:
    """Namespace grouping the Pydantic models for the filter definitions.

    The payload is a list of "targets" (year, quarter, region, ...), each of
    which carries the list of items that can be selected for that target.
    """

    class ItemBase(BaseModel, abc.ABC):
        """Fields shared by every filter item, regardless of its title type."""

        # Pydantic v2 style configuration. This replaces the nested
        # ``class Config``, which is deprecated in v2 and emits a warning.
        # A plain dict is accepted here, so no extra import is required.
        model_config = {"arbitrary_types_allowed": True}

        status: str
        selected: bool
        # Only an empty list validates: the length is capped at 0 and no value
        # can satisfy ``Never``.
        children: Annotated[list[Never], Len(max_length=0)]

    class ItemInt(ItemBase):
        """Filter item whose title is an integer."""

        title: int

    class ItemString(ItemBase):
        """Filter item whose title is a string."""

        title: str

    class TargetBase(BaseModel, abc.ABC):
        """Common base class of all filter targets."""

    class TargetYear(TargetBase):
        """Target with ``target == "year"``; items have integer titles."""

        target: Literal["year"]
        items: list[Filters.ItemInt]

    class TargetQuarter(TargetBase):
        """Target with ``target == "quarter"``; items have integer titles."""

        target: Literal["quarter"]
        items: list[Filters.ItemInt]

    class TargetRegion(TargetBase):
        """Target with ``target == "region"``; items have string titles."""

        target: Literal["region"]
        items: list[Filters.ItemString]

    class TargetEvu(TargetBase):
        """Target with ``target == "evu"``; items have string titles."""

        target: Literal["evu"]
        items: list[Filters.ItemString]

    class TargetProductType(TargetBase):
        """Target with ``target == "product_type"``; items have string titles."""

        target: Literal["product_type"]
        items: list[Filters.ItemString]

    class TargetLines(TargetBase):
        """Target with ``target == "lines"``; items have string titles."""

        target: Literal["lines"]
        items: list[Filters.ItemString]

    class TargetComplexity(TargetBase):
        """Target with ``target == "complexity"``; items have integer titles."""

        target: Literal["complexity"]
        items: list[Filters.ItemInt]

    # Union of all concrete targets; ``Data`` discriminates on the ``target`` field.
    Target: TypeAlias = (
        TargetYear
        | TargetQuarter
        | TargetRegion
        | TargetEvu
        | TargetProductType
        | TargetLines
        | TargetComplexity
    )

    class Data(BaseModel):
        """Container for the full list of filter targets."""

        # Each entry is validated as the ``Target`` member whose ``target``
        # literal matches the entry's ``target`` value.
        targets: list[
            Annotated[
                Filters.Target,
                Field(discriminator="target"),
            ]
        ]

    @staticmethod
    def from_json(json: list[dict[str, Any]]) -> Filters.Data:
        """Validate a decoded JSON list of targets into a ``Filters.Data``.

        Args:
            json: The decoded JSON payload, one dict per filter target.

        Returns:
            The validated model, with the list wrapped under ``targets``.

        Raises:
            pydantic.ValidationError: If an entry matches no known target.
        """
        return Filters.Data.model_validate({"targets": json})

# class Data(BaseModel):
# year: Filters.TargetYear
# quarter: Filters.TargetQuarter
# region: Filters.TargetRegion
# evu: Filters.TargetEvu
# product_type: Filters.TargetProductType
# lines: Filters.TargetLines
# complexity: Filters.TargetComplexity

# @staticmethod
# def from_json(json: list[dict[str, Any]]) -> Data:
# data_dict: dict[str, Filters.TargetBase] = {}

# for item in json:
# for model in Filters.TargetBase.__subclasses__():
# with contextlib.suppress(ValidationError):
# data_dict[item["target"]] = model.model_validate(item)

# return Filters.Data.model_validate(data_dict)


class Results:
    """Namespace grouping the Pydantic models for one result entry.

    Field declaration order is significant: it determines the key order of
    ``model_dump()`` for ``Results.Data``.
    """

    class ColumnBase(BaseModel, abc.ABC):
        """Common base of all metric columns; each column has a ``year`` field."""

        year: int

    class ColumnOverallRanking(ColumnBase):
        """Overall ranking column with float values."""

        # presumably one entry per quarter of ``year`` -- TODO confirm against the API
        quarters: list[float]

    class ColumnComplexity(ColumnBase):
        """Complexity column with integer values."""

        quarters: list[int]

    class ColumnPunctuality(ColumnBase):
        """Punctuality column with float values."""

        quarters: list[float]

    class ColumnReliability(ColumnBase):
        """Reliability column with float values."""

        quarters: list[float]

    class ColumnTrainFormation(ColumnBase):
        """Train formation column with float values."""

        quarters: list[float]

    class ColumnPassengers(ColumnBase):
        """Passengers column with integer values."""

        quarters: list[int]

    # Union of every concrete column model.
    Column: TypeAlias = (
        ColumnOverallRanking
        | ColumnComplexity
        | ColumnPunctuality
        | ColumnReliability
        | ColumnTrainFormation
        | ColumnPassengers
    )

    class Data(BaseModel):
        """One result entry: descriptive fields plus one column per metric."""

        # The aliased fields are read from underscore-prefixed keys in the
        # input (e.g. ``_id``) and exposed under the plain attribute name.
        id: int = Field(alias="_id")
        evu: str = Field(alias="_evu")
        evutooltip: str = Field(alias="_evutooltip")
        producttype: str = Field(alias="_producttype")
        client: str = Field(alias="_client")
        fullname: str = Field(alias="_fullname")
        subnet: str = Field(alias="_subnet")
        runtime: str = Field(alias="_runtime")
        line_stations: str = Field(alias="_line_stations")
        # The remaining fields use the same key in the input as in the model.
        line: str
        ranking: int
        overall_ranking: Results.ColumnOverallRanking
        complexity: Results.ColumnComplexity
        punctuality: Results.ColumnPunctuality
        reliability: Results.ColumnReliability
        train_formation: Results.ColumnTrainFormation
        passengers: Results.ColumnPassengers
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from collections.abc import Iterable
from typing import cast

import pandas as pd
import requests

from ddj_cloud.utils.storage import upload_dataframe

from .models import Filters, Results

BASE_URL = "https://infoportal.mobil.nrw"


def _load_filters(timeout: float = 30.0) -> Filters.Data:
    """Fetch the filter definitions from ``QmFilterShow.html`` and parse them.

    Args:
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` would block indefinitely on an unresponsive server.

    Returns:
        The validated filter targets.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
        TypeError: If the decoded JSON payload is not a list.
    """
    response = requests.get(f"{BASE_URL}/QmFilterShow.html", timeout=timeout)
    response.raise_for_status()

    response_json = response.json()
    # Explicit check rather than ``assert`` so it still runs under ``python -O``.
    if not isinstance(response_json, list):
        raise TypeError("Unexpected response type for QmFilterShow")

    return Filters.from_json(response_json)


def _list_param_raw(target: str, items: Iterable[str]):
return f"tx_cpqualitymonitor_ajaxlistfilter[filter][{target}]", ",".join(items)


def _list_param(target: Filters.Target):
    """Build the form parameter for a filter target, selecting all its items.

    Every item title is converted to ``str`` (some targets have integer
    titles) and passed on to ``_list_param_raw`` under the target's name.
    """
    titles = [str(entry.title) for entry in target.items]
    return _list_param_raw(target.target, titles)


def _load_year(
    targets: list[Filters.Target],
    year: int,
    timeout: float = 30.0,
) -> Iterable[Results.Data]:
    """Request the results for one year and yield them as validated models.

    This is a generator: the HTTP request is only sent once iteration starts.

    Args:
        targets: Filter targets to send along, each with all of its items.
            A target named ``"year"`` would override the explicit ``year``
            parameter, because later keys win when the form data is built.
        year: The year to request.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` would block indefinitely on an unresponsive server.

    Yields:
        One ``Results.Data`` per entry in the response's ``data`` list.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
        TypeError: If the decoded JSON payload is not a dict.
        ValueError: If the payload has no ``data`` key.
        pydantic.ValidationError: If an entry does not match ``Results.Data``.
    """
    url = f"{BASE_URL}/QmAjaxListFilter.html"
    post_data = dict(
        [
            _list_param_raw("year", [str(year)]),
            *(_list_param(target) for target in targets),
        ]
    )

    response = requests.post(url, data=post_data, timeout=timeout)
    response.raise_for_status()

    response_json = response.json()
    # Explicit checks rather than ``assert`` so they still run under ``python -O``.
    if not isinstance(response_json, dict):
        raise TypeError("Unexpected response type for QmAjaxListFilter")
    if "data" not in response_json:
        raise ValueError("No data in response")

    for result in response_json["data"]:
        yield Results.Data.model_validate(result)


def _to_quarter_rows(data: Results.Data, year: int):
    """Flatten one result entry into one row dict per available quarter.

    Each row contains ``year``, ``quarter``, and every field of ``data``; the
    per-metric column objects are replaced by that metric's value for the
    quarter.

    Args:
        data: The result entry to flatten.
        year: The year written into every row.

    Yields:
        One dict per quarter whose overall ranking is non-zero.
    """
    quarterly_columns = (
        "overall_ranking",
        "complexity",
        "punctuality",
        "reliability",
        "train_formation",
        "passengers",
    )
    # The dump does not depend on the quarter, so compute it once rather than
    # once per loop iteration.
    dumped = data.model_dump()

    # NOTE(review): ``quarter`` is the zero-based list index (0..3), not 1..4.
    # Confirm that consumers of the CSV expect this.
    for quarter in range(4):
        # Skip quarters where data is unavailable
        if data.overall_ranking.quarters[quarter] == 0:
            continue

        quarterly_data = {
            column: cast(Results.Column, getattr(data, column)).quarters[quarter]
            for column in quarterly_columns
        }

        # The quarterly scalars overwrite the nested column dicts from the
        # dump while keeping their key positions.
        yield {"year": year, "quarter": quarter, **dumped} | quarterly_data


def run():
    """Scrape the results for every available year and upload them as one CSV.

    Loads the filter definitions, splits off the ``year`` target, requests
    each year with all other filters fully selected, flattens the results to
    one row per quarter, and passes the resulting DataFrame to
    ``upload_dataframe``.

    Raises:
        RuntimeError: If the filter definitions list no years. Raised
            explicitly (not via ``assert``) so the guard survives
            ``python -O`` and no empty dataset is uploaded.
    """
    filters_data = _load_filters()

    targets_without_year: list[Filters.Target] = []
    years_available: list[int] = []

    for target in filters_data.targets:
        if target.target == "year":
            years_available = [item.title for item in target.items]
        else:
            targets_without_year.append(target)

    if not years_available:
        raise RuntimeError("No years available")

    rows: list[dict] = []
    for year in years_available:
        for result in _load_year(targets_without_year, year):
            rows.extend(_to_quarter_rows(result, year))

    df = pd.DataFrame(rows)
    upload_dataframe(df, "spnv_qualitaetsmonitor_nrw/data.csv")
21 changes: 21 additions & 0 deletions scrapers_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -140,5 +140,26 @@
"extra_env": [
"TALSPERREN_DATAWRAPPER_TOKEN"
]
},
{
"display_name": "SPNV-​​Qualitätsmonitor NRW",
"module_name": "spnv_qualitaetsmonitor_nrw",
"description": "Scraping the SPNV-​​Qualitätsmonitor NRW website at https://infoportal.mobil.nrw/information-service/qualitaetsmonitor-nrw.html",
"contact_name": "Jannes Höke",
"contact_email": "[email protected]",
"memory_size": "1024",
"ephemeral_storage": "512",
"preset": "minimal",
"events": [
{
"type": "schedule",
"enabled": false,
"data": {
"interval": "daily",
"interval_custom": null
}
}
],
"extra_env": []
}
]

0 comments on commit 99e197c

Please sign in to comment.