fix(parser, SV): adjust to data source changes (#7650)
consideRatio authored Jan 2, 2025
1 parent 14f0308 commit ea91b68
Showing 10 changed files with 1,287 additions and 679 deletions.
1 change: 1 addition & 0 deletions config/zones/SV.yaml
@@ -39,6 +39,7 @@ capacity:
 contributors:
   - systemcatch
   - nessie2013
+  - consideRatio
 country: SV
 emissionFactors:
   direct:
221 changes: 109 additions & 112 deletions parsers/ESTADISTICO_UT.py
@@ -1,31 +1,31 @@
 #!/usr/bin/env python3

 import json
-import re
 from datetime import datetime
 from logging import Logger, getLogger
 from zoneinfo import ZoneInfo

 from bs4 import BeautifulSoup
-from requests import Response, Session
+from requests import Session

 from electricitymap.contrib.lib.models.event_lists import (
     ProductionBreakdownList,
     ProductionMix,
 )
 from electricitymap.contrib.lib.types import ZoneKey
+from electricitymap.contrib.parsers.lib.exceptions import ParserException

 # This parser gets hourly electricity generation data from ut.com.sv for El Salvador.
-# The 'Termico' category historicallyl only consisted of generation from oil/diesel, but this changed in 2022
+# The 'Termico' category historically only consisted of generation from oil/diesel, but this changed in 2022
 # when a new Liquid Natural Gas power plant came online
 # See: https://ourworldindata.org/grapher/electricity-prod-source-stacked?country=~SLV
 # A better data source that distinguishes between oil and gas can be found in #1733 and #5233

 # Thanks to jarek for figuring out how to make the correct POST request to the data url.

 DAILY_OPERATION_URL = "https://estadistico.ut.com.sv/OperacionDiaria.aspx"
-TIMEZONE = ZoneInfo("America/El_Salvador")
 SOURCE = "ut.com.sv"
+ZONE_INFO = ZoneInfo("America/El_Salvador")

 MODE_MAPPING = {
     "Biomasa": "biomass",
@@ -38,134 +38,130 @@
 }


-def get_data(session: Session) -> Response:
+def _fetch_data(session: Session) -> dict:
     """
-    Makes a get request to data url.
-    Parses the response then makes a post request to the same url using
-    parsed parameters from the get request.
-    Returns a requests response object.
+    Fetches production data from a webpage meant for human eyes rather than
+    programmatic access.
+    The returned production data is a response meant to be used by a dashboard
+    component on the webpage which needs to be parsed further.
     """
-    pagereq = session.get(DAILY_OPERATION_URL)

-    soup = BeautifulSoup(pagereq.content, "html.parser")

-    # Find and define parameters needed to send a POST request for the actual data.
-    viewstategenerator = soup.find("input", attrs={"id": "__VIEWSTATEGENERATOR"})[
-        "value"
-    ]
-    viewstate = soup.find("input", attrs={"id": "__VIEWSTATE"})["value"]
-    eventvalidation = soup.find("input", attrs={"id": "__EVENTVALIDATION"})["value"]
-    DXCss = "1_33,1_4,1_9,1_5,15_2,15_4"
-    DXScript = "1_232,1_134,1_225,1_169,1_187,15_1,1_183,1_182,1_140,1_147,1_148,1_142,1_141,1_143,1_144,1_145,1_146,15_0,15_6,15_7"
-    callback_param_init = 'c0:{"Task":"Initialize","DashboardId":"OperacionDiaria","Settings":{"calculateHiddenTotals":false},"RequestMarker":0,"ClientState":{}}'

-    postdata = {
-        "__VIEWSTATE": viewstate,
-        "__VIEWSTATEGENERATOR": viewstategenerator,
-        "__EVENTVALIDATION": eventvalidation,
-        "__CALLBACKPARAM": callback_param_init,
+    # initial GET request to acquire required state data for POST request
+    initial_resp = session.get(DAILY_OPERATION_URL)
+    soup = BeautifulSoup(initial_resp.content, "html.parser")

+    # define POST request's post data based on
+    post_data = {
+        # dynamically set based on initial request's response
+        "__VIEWSTATE": soup.find("input", {"id": "__VIEWSTATE"})["value"],
+        "__VIEWSTATEGENERATOR": soup.find("input", {"id": "__VIEWSTATEGENERATOR"})[
+            "value"
+        ],
+        "__EVENTVALIDATION": soup.find("input", {"id": "__EVENTVALIDATION"})["value"],
+        # hardcoded based on mimicing requests seen at
+        # https://estadistico.ut.com.sv/OperacionDiaria.aspx
         "__CALLBACKID": "ASPxDashboardViewer1",
-        "DXScript": DXScript,
-        "DXCss": DXCss,
+        "__CALLBACKPARAM": 'c1:{"url":"DXDD.axd?action=DashboardItemBatchGetAction&dashboardId=DashboardID&parameters=%5B%7B%22name%22%3A%22FechaConsulta%22%2C%22value%22%3A%221990-01-01T17%3A32%3A00.000%22%2C%22type%22%3A%22System.DateTime%22%2C%22allowMultiselect%22%3Afalse%2C%22selectAll%22%3Afalse%7D%5D&items=%7B%22pivotDashboardItem1%22%3A%7B%7D%2C%22chartDashboardItem1%22%3A%7B%7D%2C%22gridDashboardItem1%22%3A%7B%7D%2C%22gridDashboardItem2%22%3A%7B%7D%2C%22gridDashboardItem3%22%3A%7B%7D%7D","method":"GET","data":""}',
+        "DXScript": "1_9,1_10,1_253,1_21,1_62,1_12,1_13,1_0,1_4,24_364,24_365,24_366,24_367,24_359,24_362,24_363,24_360,24_361,24_479,24_480,25_0,24_368,24_440,24_441,15_0,25_2,25_1,25_3",
+        "DXCss": "1_72,1_66,24_378,24_379,24_414,24_442,24_443,24_478,15_1",
     }

-    datareq = session.post(DAILY_OPERATION_URL, data=postdata)
+    data_resp = session.post(DAILY_OPERATION_URL, data=post_data)

+    # The text response is expected to look like one of the strings:
+    #
+    # 0|/*DX*/({'id':1,'result':'{"gridDashboardItem3": {}, "gridDashboardItem2": {}}'})
+    #
+    # 0|/*DX*/({'error':{'message':'Callback request failed due to an internal server error.'},'result':null,'id':1})
+    #
+    # Note that:
+    # - <content> is wrapped like 0|/*DX*/(<content>)
+    # - <content> is JSON like, but using single quotes instead of double quotes
+    # - content data can include a result key, and possibly also an error key
+    # - the result value is a JSON string
+    #
+    content_string = data_resp.text[len("0|/*DX*/(") : -len(")")]
+    content_json = content_string.replace('"', r"\"").replace("'", '"')
+    content_data = json.loads(content_json)
+    if content_data.get("error"):
+        raise ParserException(
+            parser="SV", message=f'Error response returned: {content_data["error"]}'
+        )
+    data_resp = json.loads(content_data["result"])

-    return datareq
+    return data_resp


-def data_parser(response: Response) -> list[dict]:
+def _parse_data(data: dict) -> list[dict]:
     """
-    Slices the object down to a smaller size then converts to usable json.
-    Loads the data as json then finds the 'result' key.
-    Uses regex to find the start
-    and endpoints of the actual data.
-    Splits the data into datapoints then cleans them up for processing.
+    Parses already fetched data meant for use by a dashboard further into a list
+    of dictionaries.
     """
-    double_json = response.text[len("0|/*DX*/(") : -1]
-    double_json = double_json.replace("'", '"')
-    double_json = double_json.replace("\\n", "")
-    double_json = double_json.replace("\\t", "")
-    # Replacing js date objects with isoformat strings.
-    JS_DATE_REGEX = re.compile(
-        r"new Date\((?P<year>\d*),(?P<month>\d*),(?P<day>\d*),(?P<hour>\d*),(?P<minute>\d*),(?P<second>\d*),(?P<ms>\d*)\)"
-    )
-    matches = JS_DATE_REGEX.findall(double_json)
-    if matches:
-        for _match in matches:
-            year, month, day, hour, minute, second, ms = _match
-            dt = datetime(
-                year=int(year),
-                month=int(month) + 1,
-                day=int(day),
-                hour=int(hour),
-                tzinfo=TIMEZONE,
-            )
-            double_json = double_json.replace(
-                f"new Date({year},{month},{day},{hour},{minute},{second},{ms})",
-                f'\\"{dt.isoformat()}\\"',
-            )
-    data = json.loads(double_json)
-    jsresult = data["result"]
-    clean_json = json.loads(jsresult[1:-1])
-    datapoints = []
-    for item in clean_json["PaneContent"]:
-        generation_data = item["ItemData"]["DataStorageDTO"]
-        mapping = generation_data["EncodeMaps"]
-        if "DataItem3" not in mapping or len(mapping["DataItem3"]) != 1:
-            continue
-        day = mapping["DataItem3"][0]
-        hours = mapping["DataItem1"]
-        modes = mapping["DataItem2"]
-        slices = generation_data[
-            "Slices"
-        ]  # Slices are the different reprensentations of the data (hourly totals, hourly breakdowns, daily totals, daily breakdowns)
-        hourly_mode_breakdown = list(
-            filter(
-                lambda x: x["KeyIds"] == ["DataItem2", "DataItem3", "DataItem1"], slices
-            )
-        )[0]  # We take the hourly breakdown per mode
-        for keys, value in hourly_mode_breakdown["Data"].items():
-            key_ids = [int(key) for key in keys[1:-1].split(",")]
-            mode = modes[key_ids[0]]
-            hour = hours[key_ids[2]]
-            datapoint = {
-                "mode": mode,
-                "datetime": datetime.fromisoformat(day).replace(
-                    hour=int(hour), tzinfo=TIMEZONE
-                ),
-                "value": value["0"],
+    production_data = data["pivotDashboardItem1"]["ItemData"]["DataStorageDTO"]

+    # power production data is available for listed modes, days, and hours
+    modes = production_data["EncodeMaps"]["DataItem2"]
+    days = production_data["EncodeMaps"]["DataItem3"]
+    hours = production_data["EncodeMaps"]["DataItem1"]

+    # look at power production data for specific mode, day, and hour
+    mode_day_hour_dict = [
+        s
+        for s in production_data["Slices"]
+        if s["KeyIds"] == ["DataItem2", "DataItem3", "DataItem1"]
+    ][0]["Data"]

+    data_points = []
+    for index_values_json, mwh_production_dict in mode_day_hour_dict.items():
+        # index_values_json can for example look like "[1,0,1]", which would
+        # indicate that its associated with the second mode, first day, and
+        # second hour from the available modes, days, and hours
+        index_values = json.loads(index_values_json)
+        mode = modes[index_values[0]]
+        day = days[index_values[1]]
+        hour = hours[index_values[2]]

+        mwh_production = mwh_production_dict["0"]

+        # Python <=3.10 fromisoformat can't parse strings with sub-seconds,
+        # which the day string includes
+        day = day[: -len(".0000000")]
+        dt = datetime.fromisoformat(day).replace(hour=int(hour), tzinfo=ZONE_INFO)

+        data_points.append(
+            {
+                "mode": MODE_MAPPING[mode],
+                "datetime": dt,
+                "value": mwh_production,
             }
-            datapoints.append(datapoint)

-    return datapoints
+        )
+    return data_points


-def data_processer(
+def _process_data(
     zone_key: ZoneKey, data: list[dict], logger: Logger
 ) -> ProductionBreakdownList:
-    """
-    Takes data in the form of a list of lists.
-    Converts each list to a dictionary.
-    Joins dictionaries based on shared datetime key.
-    Maps generation to type.
-    """
+    # ignore collected exchange data for now
+    data = [d for d in data if d["mode"] != "exchange"]

     per_mode_production: dict[str, ProductionBreakdownList] = {}
-    filtered_data = filter(
-        lambda x: x["mode"] != "Interconexión", data
-    )  # TODO: handle interconnection
-    for point in filtered_data:
-        mode = point["mode"]
+    for d in data:
+        mode = d["mode"]
         if mode not in per_mode_production:
             per_mode_production[mode] = ProductionBreakdownList(logger)

         mix = ProductionMix()
-        mix.add_value(MODE_MAPPING[mode], point["value"])
+        mix.add_value(mode, d["value"])
         per_mode_production[mode].append(
-            zoneKey=zone_key, datetime=point["datetime"], source=SOURCE, production=mix
+            datetime=d["datetime"],
+            production=mix,
+            zoneKey=zone_key,
+            source=SOURCE,
         )

     return ProductionBreakdownList.merge_production_breakdowns(
-        list(per_mode_production.values()), logger
+        list(per_mode_production.values()),
+        logger,
     )
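
Two things are worth calling out in the new `_fetch_data`: the DevExpress callback response arrives wrapped as `0|/*DX*/(<content>)` with single-quoted, JSON-like content, and an error payload now raises a `ParserException`. Below is a minimal standalone sketch of that unwrapping step with an invented sample response; it is an illustration only, and uses `RuntimeError` where the real parser raises `ParserException`:

import json

# Invented sample of a successful callback response; real payloads are much larger.
raw = """0|/*DX*/({'id':1,'result':'{"pivotDashboardItem1": {}}'})"""

# Strip the 0|/*DX*/( ... ) wrapper.
content_string = raw[len("0|/*DX*/(") : -len(")")]
# Escape the existing double quotes, then turn the single-quoted pseudo-JSON into valid JSON.
content_json = content_string.replace('"', r"\"").replace("'", '"')
content_data = json.loads(content_json)

if content_data.get("error"):
    raise RuntimeError(f"Error response returned: {content_data['error']}")

# The "result" value is itself a JSON string that still needs decoding.
print(json.loads(content_data["result"]))  # {'pivotDashboardItem1': {}}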
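
The indexing scheme decoded by the new `_parse_data` can likewise be illustrated with a toy payload: `EncodeMaps` lists the available modes, days, and hours, and each key of the hourly slice is a JSON array of indexes into those lists. The field names below follow the diff; all values are made up:

import json

data_storage_dto = {
    "EncodeMaps": {
        "DataItem2": ["Biomasa", "Solar"],  # modes
        "DataItem3": ["2025-01-01T00:00:00.0000000"],  # days
        "DataItem1": ["0", "1"],  # hours
    },
    "Slices": [
        {
            "KeyIds": ["DataItem2", "DataItem3", "DataItem1"],
            # "[1,0,1]" -> second mode, first day, second hour
            "Data": {"[1,0,1]": {"0": 12.5}},
        }
    ],
}

modes = data_storage_dto["EncodeMaps"]["DataItem2"]
days = data_storage_dto["EncodeMaps"]["DataItem3"]
hours = data_storage_dto["EncodeMaps"]["DataItem1"]

hourly_slice = [
    s
    for s in data_storage_dto["Slices"]
    if s["KeyIds"] == ["DataItem2", "DataItem3", "DataItem1"]
][0]["Data"]

for index_values_json, production in hourly_slice.items():
    i_mode, i_day, i_hour = json.loads(index_values_json)
    print(modes[i_mode], days[i_day], hours[i_hour], production["0"])
    # prints: Solar 2025-01-01T00:00:00.0000000 1 12.5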


@@ -180,9 +176,10 @@ def fetch_production(
         raise NotImplementedError("This parser is not yet able to parse past dates")
     if session is None:
         session = Session()
-    req = get_data(session)
-    parsed = data_parser(req)
-    production_breakdown = data_processer(zone_key, parsed, logger)

+    data = _fetch_data(session)
+    parsed_data = _parse_data(data)
+    production_breakdown = _process_data(zone_key, parsed_data, logger)
     return production_breakdown.to_list()
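
With the helpers renamed, `fetch_production` boils down to a fetch, parse, process pipeline. A rough sketch of calling it directly, not part of the commit, assuming the standard contrib parser signature (positional `zone_key` plus `session` and `logger` keyword arguments) and live access to ut.com.sv:

from logging import getLogger

from requests import Session

from electricitymap.contrib.lib.types import ZoneKey
from parsers.ESTADISTICO_UT import fetch_production

events = fetch_production(ZoneKey("SV"), session=Session(), logger=getLogger("SV"))
for event in events[:3]:
    print(event)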


1 change: 0 additions & 1 deletion parsers/test/mocks/ESTADISTICO_UT/data.html

This file was deleted.
