fix(parser, SV): adjust to data source changes (#7650)
consideRatio authored Jan 2, 2025
1 parent 14f0308 commit ea91b68
Showing 10 changed files with 1,287 additions and 679 deletions.
1 change: 1 addition & 0 deletions config/zones/SV.yaml
@@ -39,6 +39,7 @@ capacity:
 contributors:
   - systemcatch
   - nessie2013
+  - consideRatio
 country: SV
 emissionFactors:
   direct:
221 changes: 109 additions & 112 deletions parsers/ESTADISTICO_UT.py
@@ -1,31 +1,31 @@
 #!/usr/bin/env python3

 import json
-import re
 from datetime import datetime
 from logging import Logger, getLogger
 from zoneinfo import ZoneInfo

 from bs4 import BeautifulSoup
-from requests import Response, Session
+from requests import Session

 from electricitymap.contrib.lib.models.event_lists import (
     ProductionBreakdownList,
     ProductionMix,
 )
 from electricitymap.contrib.lib.types import ZoneKey
+from electricitymap.contrib.parsers.lib.exceptions import ParserException

 # This parser gets hourly electricity generation data from ut.com.sv for El Salvador.
-# The 'Termico' category historicallyl only consisted of generation from oil/diesel, but this changed in 2022
+# The 'Termico' category historically only consisted of generation from oil/diesel, but this changed in 2022
 # when a new Liquid Natural Gas power plant came online
 # See: https://ourworldindata.org/grapher/electricity-prod-source-stacked?country=~SLV
 # A better data source that distinguishes between oil and gas can be found in #1733 and #5233

 # Thanks to jarek for figuring out how to make the correct POST request to the data url.

 DAILY_OPERATION_URL = "https://estadistico.ut.com.sv/OperacionDiaria.aspx"
-TIMEZONE = ZoneInfo("America/El_Salvador")
 SOURCE = "ut.com.sv"
+ZONE_INFO = ZoneInfo("America/El_Salvador")

 MODE_MAPPING = {
     "Biomasa": "biomass",
@@ -38,134 +38,130 @@
 }


-def get_data(session: Session) -> Response:
+def _fetch_data(session: Session) -> dict:
     """
-    Makes a get request to data url.
-    Parses the response then makes a post request to the same url using
-    parsed parameters from the get request.
-    Returns a requests response object.
+    Fetches production data from a webpage meant for human eyes rather than
+    programmatic access.
+    The returned production data is a response meant to be used by a dashboard
+    component on the webpage which needs to be parsed further.
     """
-    pagereq = session.get(DAILY_OPERATION_URL)

-    soup = BeautifulSoup(pagereq.content, "html.parser")

-    # Find and define parameters needed to send a POST request for the actual data.
-    viewstategenerator = soup.find("input", attrs={"id": "__VIEWSTATEGENERATOR"})[
-        "value"
-    ]
-    viewstate = soup.find("input", attrs={"id": "__VIEWSTATE"})["value"]
-    eventvalidation = soup.find("input", attrs={"id": "__EVENTVALIDATION"})["value"]
-    DXCss = "1_33,1_4,1_9,1_5,15_2,15_4"
-    DXScript = "1_232,1_134,1_225,1_169,1_187,15_1,1_183,1_182,1_140,1_147,1_148,1_142,1_141,1_143,1_144,1_145,1_146,15_0,15_6,15_7"
-    callback_param_init = 'c0:{"Task":"Initialize","DashboardId":"OperacionDiaria","Settings":{"calculateHiddenTotals":false},"RequestMarker":0,"ClientState":{}}'

-    postdata = {
-        "__VIEWSTATE": viewstate,
-        "__VIEWSTATEGENERATOR": viewstategenerator,
-        "__EVENTVALIDATION": eventvalidation,
-        "__CALLBACKPARAM": callback_param_init,
+    # initial GET request to acquire required state data for POST request
+    initial_resp = session.get(DAILY_OPERATION_URL)
+    soup = BeautifulSoup(initial_resp.content, "html.parser")

+    # define POST request's post data based on
+    post_data = {
+        # dynamically set based on initial request's response
+        "__VIEWSTATE": soup.find("input", {"id": "__VIEWSTATE"})["value"],
+        "__VIEWSTATEGENERATOR": soup.find("input", {"id": "__VIEWSTATEGENERATOR"})[
+            "value"
+        ],
+        "__EVENTVALIDATION": soup.find("input", {"id": "__EVENTVALIDATION"})["value"],
+        # hardcoded based on mimicing requests seen at
+        # https://estadistico.ut.com.sv/OperacionDiaria.aspx
         "__CALLBACKID": "ASPxDashboardViewer1",
-        "DXScript": DXScript,
-        "DXCss": DXCss,
+        "__CALLBACKPARAM": 'c1:{"url":"DXDD.axd?action=DashboardItemBatchGetAction&dashboardId=DashboardID&parameters=%5B%7B%22name%22%3A%22FechaConsulta%22%2C%22value%22%3A%221990-01-01T17%3A32%3A00.000%22%2C%22type%22%3A%22System.DateTime%22%2C%22allowMultiselect%22%3Afalse%2C%22selectAll%22%3Afalse%7D%5D&items=%7B%22pivotDashboardItem1%22%3A%7B%7D%2C%22chartDashboardItem1%22%3A%7B%7D%2C%22gridDashboardItem1%22%3A%7B%7D%2C%22gridDashboardItem2%22%3A%7B%7D%2C%22gridDashboardItem3%22%3A%7B%7D%7D","method":"GET","data":""}',
+        "DXScript": "1_9,1_10,1_253,1_21,1_62,1_12,1_13,1_0,1_4,24_364,24_365,24_366,24_367,24_359,24_362,24_363,24_360,24_361,24_479,24_480,25_0,24_368,24_440,24_441,15_0,25_2,25_1,25_3",
+        "DXCss": "1_72,1_66,24_378,24_379,24_414,24_442,24_443,24_478,15_1",
     }

-    datareq = session.post(DAILY_OPERATION_URL, data=postdata)
+    data_resp = session.post(DAILY_OPERATION_URL, data=post_data)

+    # The text response is expected to look like one of the strings:
+    #
+    # 0|/*DX*/({'id':1,'result':'{"gridDashboardItem3": {}, "gridDashboardItem2": {}}'})
+    #
+    # 0|/*DX*/({'error':{'message':'Callback request failed due to an internal server error.'},'result':null,'id':1})
+    #
+    # Note that:
+    # - <content> is wrapped like 0|/*DX*/(<content>)
+    # - <content> is JSON like, but using single quotes instead of double quotes
+    # - content data can include a result key, and possibly also an error key
+    # - the result value is a JSON string
+    #
+    content_string = data_resp.text[len("0|/*DX*/(") : -len(")")]
+    content_json = content_string.replace('"', r"\"").replace("'", '"')
+    content_data = json.loads(content_json)
+    if content_data.get("error"):
+        raise ParserException(
+            parser="SV", message=f'Error response returned: {content_data["error"]}'
+        )
+    data_resp = json.loads(content_data["result"])

-    return datareq
+    return data_resp


-def data_parser(response: Response) -> list[dict]:
+def _parse_data(data: dict) -> list[dict]:
     """
-    Slices the object down to a smaller size then converts to usable json.
-    Loads the data as json then finds the 'result' key.
-    Uses regex to find the start
-    and endpoints of the actual data.
-    Splits the data into datapoints then cleans them up for processing.
+    Parses already fetched data meant for use by a dashboard further into a list
+    of dictionaries.
     """
-    double_json = response.text[len("0|/*DX*/(") : -1]
-    double_json = double_json.replace("'", '"')
-    double_json = double_json.replace("\\n", "")
-    double_json = double_json.replace("\\t", "")
-    # Replacing js date objects with isoformat strings.
-    JS_DATE_REGEX = re.compile(
-        r"new Date\((?P<year>\d*),(?P<month>\d*),(?P<day>\d*),(?P<hour>\d*),(?P<minute>\d*),(?P<second>\d*),(?P<ms>\d*)\)"
-    )
-    matches = JS_DATE_REGEX.findall(double_json)
-    if matches:
-        for _match in matches:
-            year, month, day, hour, minute, second, ms = _match
-            dt = datetime(
-                year=int(year),
-                month=int(month) + 1,
-                day=int(day),
-                hour=int(hour),
-                tzinfo=TIMEZONE,
-            )
-            double_json = double_json.replace(
-                f"new Date({year},{month},{day},{hour},{minute},{second},{ms})",
-                f'\\"{dt.isoformat()}\\"',
-            )
-    data = json.loads(double_json)
-    jsresult = data["result"]
-    clean_json = json.loads(jsresult[1:-1])
-    datapoints = []
-    for item in clean_json["PaneContent"]:
-        generation_data = item["ItemData"]["DataStorageDTO"]
-        mapping = generation_data["EncodeMaps"]
-        if "DataItem3" not in mapping or len(mapping["DataItem3"]) != 1:
-            continue
-        day = mapping["DataItem3"][0]
-        hours = mapping["DataItem1"]
-        modes = mapping["DataItem2"]
-        slices = generation_data[
-            "Slices"
-        ]  # Slices are the different reprensentations of the data (hourly totals, hourly breakdowns, daily totals, daily breakdowns)
-        hourly_mode_breakdown = list(
-            filter(
-                lambda x: x["KeyIds"] == ["DataItem2", "DataItem3", "DataItem1"], slices
-            )
-        )[0]  # We take the hourly breakdown per mode
-        for keys, value in hourly_mode_breakdown["Data"].items():
-            key_ids = [int(key) for key in keys[1:-1].split(",")]
-            mode = modes[key_ids[0]]
-            hour = hours[key_ids[2]]
-            datapoint = {
-                "mode": mode,
-                "datetime": datetime.fromisoformat(day).replace(
-                    hour=int(hour), tzinfo=TIMEZONE
-                ),
-                "value": value["0"],
+    production_data = data["pivotDashboardItem1"]["ItemData"]["DataStorageDTO"]

+    # power production data is available for listed modes, days, and hours
+    modes = production_data["EncodeMaps"]["DataItem2"]
+    days = production_data["EncodeMaps"]["DataItem3"]
+    hours = production_data["EncodeMaps"]["DataItem1"]

+    # look at power production data for specific mode, day, and hour
+    mode_day_hour_dict = [
+        s
+        for s in production_data["Slices"]
+        if s["KeyIds"] == ["DataItem2", "DataItem3", "DataItem1"]
+    ][0]["Data"]

+    data_points = []
+    for index_values_json, mwh_production_dict in mode_day_hour_dict.items():
+        # index_values_json can for example look like "[1,0,1]", which would
+        # indicate that its associated with the second mode, first day, and
+        # second hour from the available modes, days, and hours
+        index_values = json.loads(index_values_json)
+        mode = modes[index_values[0]]
+        day = days[index_values[1]]
+        hour = hours[index_values[2]]

+        mwh_production = mwh_production_dict["0"]

+        # Python <=3.10 fromisoformat can't parse strings with sub-seconds,
+        # which the day string includes
+        day = day[: -len(".0000000")]
+        dt = datetime.fromisoformat(day).replace(hour=int(hour), tzinfo=ZONE_INFO)

+        data_points.append(
+            {
+                "mode": MODE_MAPPING[mode],
+                "datetime": dt,
+                "value": mwh_production,
             }
-            datapoints.append(datapoint)

-    return datapoints
+        )
+    return data_points


-def data_processer(
+def _process_data(
     zone_key: ZoneKey, data: list[dict], logger: Logger
 ) -> ProductionBreakdownList:
-    """
-    Takes data in the form of a list of lists.
-    Converts each list to a dictionary.
-    Joins dictionaries based on shared datetime key.
-    Maps generation to type.
-    """
+    # ignore collected exchange data for now
+    data = [d for d in data if d["mode"] != "exchange"]

     per_mode_production: dict[str, ProductionBreakdownList] = {}
-    filtered_data = filter(
-        lambda x: x["mode"] != "Interconexión", data
-    )  # TODO: handle interconnection
-    for point in filtered_data:
-        mode = point["mode"]
+    for d in data:
+        mode = d["mode"]
         if mode not in per_mode_production:
             per_mode_production[mode] = ProductionBreakdownList(logger)

         mix = ProductionMix()
-        mix.add_value(MODE_MAPPING[mode], point["value"])
+        mix.add_value(mode, d["value"])
         per_mode_production[mode].append(
-            zoneKey=zone_key, datetime=point["datetime"], source=SOURCE, production=mix
+            datetime=d["datetime"],
+            production=mix,
+            zoneKey=zone_key,
+            source=SOURCE,
         )

     return ProductionBreakdownList.merge_production_breakdowns(
-        list(per_mode_production.values()), logger
+        list(per_mode_production.values()),
+        logger,
     )
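
Two things are worth calling out in the new `_fetch_data`: the DevExpress callback response arrives wrapped as `0|/*DX*/(<content>)` with single-quoted, JSON-like content, and an error payload now raises a `ParserException`. Below is a minimal standalone sketch of that unwrapping step with an invented sample response; it is an illustration only, and uses `RuntimeError` where the real parser raises `ParserException`:

import json

# Invented sample of a successful callback response; real payloads are much larger.
raw = """0|/*DX*/({'id':1,'result':'{"pivotDashboardItem1": {}}'})"""

# Strip the 0|/*DX*/( ... ) wrapper.
content_string = raw[len("0|/*DX*/(") : -len(")")]
# Escape the existing double quotes, then turn the single-quoted pseudo-JSON into valid JSON.
content_json = content_string.replace('"', r"\"").replace("'", '"')
content_data = json.loads(content_json)

if content_data.get("error"):
    raise RuntimeError(f"Error response returned: {content_data['error']}")

# The "result" value is itself a JSON string that still needs decoding.
print(json.loads(content_data["result"]))  # {'pivotDashboardItem1': {}}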
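
The indexing scheme decoded by the new `_parse_data` can likewise be illustrated with a toy payload: `EncodeMaps` lists the available modes, days, and hours, and each key of the hourly slice is a JSON array of indexes into those lists. The field names below follow the diff; all values are made up:

import json

data_storage_dto = {
    "EncodeMaps": {
        "DataItem2": ["Biomasa", "Solar"],  # modes
        "DataItem3": ["2025-01-01T00:00:00.0000000"],  # days
        "DataItem1": ["0", "1"],  # hours
    },
    "Slices": [
        {
            "KeyIds": ["DataItem2", "DataItem3", "DataItem1"],
            # "[1,0,1]" -> second mode, first day, second hour
            "Data": {"[1,0,1]": {"0": 12.5}},
        }
    ],
}

modes = data_storage_dto["EncodeMaps"]["DataItem2"]
days = data_storage_dto["EncodeMaps"]["DataItem3"]
hours = data_storage_dto["EncodeMaps"]["DataItem1"]

hourly_slice = [
    s
    for s in data_storage_dto["Slices"]
    if s["KeyIds"] == ["DataItem2", "DataItem3", "DataItem1"]
][0]["Data"]

for index_values_json, production in hourly_slice.items():
    i_mode, i_day, i_hour = json.loads(index_values_json)
    print(modes[i_mode], days[i_day], hours[i_hour], production["0"])
    # prints: Solar 2025-01-01T00:00:00.0000000 1 12.5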


@@ -180,9 +176,10 @@ def fetch_production(
         raise NotImplementedError("This parser is not yet able to parse past dates")
     if session is None:
         session = Session()
-    req = get_data(session)
-    parsed = data_parser(req)
-    production_breakdown = data_processer(zone_key, parsed, logger)

+    data = _fetch_data(session)
+    parsed_data = _parse_data(data)
+    production_breakdown = _process_data(zone_key, parsed_data, logger)
     return production_breakdown.to_list()
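
With the helpers renamed, `fetch_production` boils down to a fetch, parse, process pipeline. A rough sketch of calling it directly, not part of the commit, assuming the standard contrib parser signature (positional `zone_key` plus `session` and `logger` keyword arguments) and live access to ut.com.sv:

from logging import getLogger

from requests import Session

from electricitymap.contrib.lib.types import ZoneKey
from parsers.ESTADISTICO_UT import fetch_production

events = fetch_production(ZoneKey("SV"), session=Session(), logger=getLogger("SV"))
for event in events[:3]:
    print(event)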


1 change: 0 additions & 1 deletion parsers/test/mocks/ESTADISTICO_UT/data.html

This file was deleted.
