cmu-delphi · nmdefries · Aug 21, 2023 · Aug 31, 2023 · Aug 31, 2023 · Sep 1, 2023
diff --git a/src/acquisition/flusurv/flusurv.py b/src/acquisition/flusurv/flusurv.py
@@ -3,7 +3,7 @@
 === Purpose ===
 ===============
 
-Fetches FluSurv-NET data (flu hospitaliation rates) from CDC. Unlike the other
+Fetches FluSurv-NET data (flu hospitalization rates) from CDC. Unlike the other
 CDC-hosted datasets (e.g. FluView), FluSurv is not available as a direct
 download. This program emulates web browser requests for the web app and
 extracts data of interest from the JSON response.
@@ -36,9 +36,11 @@
 """
 
 # standard library
+from collections import defaultdict
 from datetime import datetime
 import json
 import time
+from warnings import warn
 
 # third party
 import requests
@@ -49,7 +51,7 @@
 
 # all currently available FluSurv locations and their associated codes
 # the number pair represents NetworkID and CatchmentID
-location_codes = {
+location_to_code = {
     "CA": (2, 1),
     "CO": (2, 2),
     "CT": (2, 3),
@@ -120,17 +122,46 @@ def fetch_json(path, payload, call_count=1, requests_impl=requests):
     return resp.json()
 
 
-def fetch_flusurv_object(location_code):
-    """Return decoded FluSurv JSON object for the given location."""
-    return fetch_json(
-        "PostPhase03GetData",
+def fetch_flusurv_location(location, seasonids):
+    """Return FluSurv JSON object for the given location."""
+    location_code = location_to_code[location]
+
+    result = fetch_json(
+        "PostPhase03DataTool",
         {
             "appversion": "Public",
-            "networkid": location_code[0],
-            "cacthmentid": location_code[1],
+            "key": "getdata",
+            "injson": [
+                {
+                    "networkid": location_code[0],
+                    "catchmentid": location_code[1],
+                    "seasonid": elem,
+                } for elem in seasonids],
         },
     )
 
+    # If no data is returned (a given seasonid is not reported,
+    # location codes are invalid, etc), the API returns a JSON like:
+    #    {
+    #        'default_data': {
+    #            'response': 'No Data'
+    #            }
+    #    }
+    #
+    # If data is returned, then result["default_data"] is a list
+    #  and result["default_data"]["response"] doesn't exist.
+    assert isinstance(result["default_data"], list) and len(result["default_data"]) > 0, \
+        f"Data was not correctly returned from the API for {location}"
+    return result
+
+
+def fetch_flusurv_metadata():
+    """Return FluSurv JSON metadata object."""
+    return fetch_json(
+        "PostPhase03DataTool",
+        {"appversion": "Public", "key": "", "injson": []}
+    )
+
 
 def mmwrid_to_epiweek(mmwrid):
     """Convert a CDC week index into an epiweek."""
@@ -142,78 +173,192 @@ def mmwrid_to_epiweek(mmwrid):
     return epiweek_200340.add_weeks(mmwrid - mmwrid_200340).get_ew()
 
 
-def extract_from_object(data_in):
+def group_by_epiweek(data, metadata):
     """
-    Given a FluSurv data object, return hospitaliation rates.
+    Convert default data for a single location into an epiweek-grouped dictionary
 
-    The returned object is indexed first by epiweek, then by zero-indexed age
-    group.
-    """
+    Args:
+        data: GRASP API response object containing a "default_data" list, as fetched with `fetch_flusurv_location`
+        metadata: GRASP API metadata object with "master_lookup" and "seasons" elements (presumably from `fetch_flusurv_metadata`)
 
-    # an object to hold the result
-    data_out = {}
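+    Returns a dictionary for this one location, keyed first by epiweek and
+    then by group name (not by location), of the format
+        {
+            <epiweek>: {
+                "season": <season label>,
+                "rate_overall": <weekly rate>,
+                "rate_age_0": <weekly rate>,
+                ...
+                "rate_race_<label>": <weekly rate>,
+                ...
+            }
+            ...
+        }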
+    """
+    data = data["default_data"]
 
-    # iterate over all seasons and age groups
-    for obj in data_in["busdata"]["dataseries"]:
-        if obj["age"] in (10, 11, 12):
-            # TODO(https://github.com/cmu-delphi/delphi-epidata/issues/242):
-            #   capture as-of-yet undefined age groups 10, 11, and 12
-            continue
-        age_index = obj["age"] - 1
-        # iterage over weeks
-        for mmwrid, _, _, rate in obj["data"]:
-            epiweek = mmwrid_to_epiweek(mmwrid)
-            if epiweek not in data_out:
-                # weekly rate of each age group
-                data_out[epiweek] = [None] * 9
-            prev_rate = data_out[epiweek][age_index]
-            if prev_rate is None:
-                # this is the first time to see a rate for this epiweek/age
-                data_out[epiweek][age_index] = rate
-            elif prev_rate != rate:
-                # a different rate was already found for this epiweek/age
-                format_args = (epiweek, obj["age"], prev_rate, rate)
-                print("warning: %d %d %f != %f" % format_args)
-
-    # sanity check the result
-    if len(data_out) == 0:
+    # Sanity check the input. We expect to see some epiweeks
+    if len(data) == 0:
         raise Exception("no data found")
 
-    # print the result and return flu data
-    print(f"found data for {len(data_out)} weeks")
+    id_label_map = make_id_label_map(metadata)
+    id_season_map = make_id_season_map(metadata)
+
+    # Create output object
+    # First layer of keys is epiweeks. Second layer of keys is groups
+    #  (by id, not age in years, sex abbr, etc).
+    #
+    # If a top-level key doesn't already exist, create a new empty dict.
+    # If a secondary key doesn't already exist, looking it up creates the
+    #  key with a default value of None.
+    data_out = defaultdict(lambda: defaultdict(lambda: None))
+
+    # At this point `data` is the response's "default_data" element, a list of dictionaries with the format
+    #     [
+    #         {'networkid': 1, 'catchmentid': 22, 'seasonid': 49, 'ageid': 0, 'sexid': 0, 'raceid': 1, 'rate': 4.3, 'weeklyrate': 1.7, 'mmwrid': 2493},
+    #         {'networkid': 1, 'catchmentid': 22, 'seasonid': 49, 'ageid': 0, 'sexid': 0, 'raceid': 1, 'rate': 20.3, 'weeklyrate': 0.1, 'mmwrid': 2513},
+    #         {'networkid': 1, 'catchmentid': 22, 'seasonid': 49, 'ageid': 0, 'sexid': 0, 'raceid': 1, 'rate': 20.6, 'weeklyrate': 0.1, 'mmwrid': 2516},
+    #         ...
+    #     ]
+    for obs in data:
+        epiweek = mmwrid_to_epiweek(obs["mmwrid"])
+        season = id_season_map[obs["seasonid"]]
+        groupname = groupids_to_name(
+            ageid = obs["ageid"], sexid = obs["sexid"], raceid = obs["raceid"],
+            id_label_map = id_label_map
+        )
+
+        # Set season description. This will be overwritten every iteration,
+        #  but should always have the same value per epiweek group.
+        data_out[epiweek]["season"] = season
+
+        rate = obs["weeklyrate"]
+        prev_rate = data_out[epiweek][groupname]
+        if prev_rate is None:
+            # This is the first time to see a rate for this epiweek-group
+            #  combo
+            data_out[epiweek][groupname] = rate
+        elif prev_rate != rate:
+            # Skip and warn; a different rate was already found for this
+            # epiweek-group combo
+            warn((f"warning: Multiple rates seen for {epiweek} "
+                   f"{groupname}, but previous value {prev_rate} does not "
+                   f"equal new value {rate}. Using the first value."))
+
+    # Sanity check the output. We expect to have populated our dictionary.
+    if len(data_out.keys()) == 0:
+        raise Exception("no data loaded")
+
+    print(f"found data for {len(data_out.keys())} epiweeks")
+
     return data_out
 
 
-def get_data(location_code):
+def get_data(location, seasonids, metadata):
     """
     Fetch and parse flu data for the given location.
 
     This method performs the following operations:
-      - fetches FluSurv data from CDC
-      - extracts and returns hospitaliation rates
+      - fetches location-specific FluSurv data from the CDC API
+      - extracts and returns hospitalization rates for each epiweek
     """
-
     # fetch
     print("[fetching flusurv data...]")
-    data_in = fetch_flusurv_object(location_code)
+    data_in = fetch_flusurv_location(location, seasonids)
 
     # extract
-    print("[extracting values...]")
-    data_out = extract_from_object(data_in)
+    print("[reformatting flusurv result...]")
+    data_out = group_by_epiweek(data_in, metadata)
 
     # return
-    print("[scraped successfully]")
+    print(f"[successfully fetched data for {location}]")
     return data_out
 
 
-def get_current_issue():
-    """Scrape the current issue from the FluSurv main page."""
-
-    # fetch
-    data = fetch_json("GetPhase03InitApp?appVersion=Public", None)
+def get_current_issue(data):
+    """
+    Extract the current issue from the FluSurv API result.
 
+    Args:
+        data: dictionary representing a JSON response from the FluSurv API
+    """
     # extract
     date = datetime.strptime(data["loaddatetime"], "%b %d, %Y")
 
     # convert and return
     return EpiDate(date.year, date.month, date.day).get_ew()
+
+
+def make_id_label_map(metadata):
+    """Create a nested map from group variable (e.g. "Age") and valueid to a normalized group label"""
+    id_to_label = defaultdict(lambda: defaultdict(lambda: None))
+    for group in metadata["master_lookup"]:
+        # Skip "overall" group
+        if group["Variable"] is None:
+            continue
+        id_to_label[group["Variable"]][group["valueid"]] = group["Label"].replace(
+            " ", ""
+        ).replace(
+            "/", ""
+        ).replace(
+            "-", "t"
+        ).replace(
+            "yr", ""
+        ).lower()
+
+    return id_to_label
+
+
+def make_id_season_map(metadata):
+    """Create a map from seasonid to season description, in the format "YYYY-YY" """
+    id_to_label = defaultdict(lambda: defaultdict(lambda: None))
+    for season in metadata["seasons"]:
+        id_to_label[season["seasonid"]] = season["label"]
+
+    return id_to_label
+
+
+def groupids_to_name(ageid, sexid, raceid, id_label_map):
+    # Expect at least 2 of three ids to be 0
+    assert (ageid, sexid, raceid).count(0) >= 2, \
+        "At most one groupid can be non-zero"
+    if (ageid, sexid, raceid).count(0) == 3:
+        group = "overall"
+    elif ageid != 0:
+        # The column names used in the DB for the original age groups
+        #  are ordinal, such that:
+        #     "rate_age_0" corresponds to age group 1, 0-4 yr
+        #     "rate_age_1" corresponds to age group 2, 5-17 yr
+        #     "rate_age_2" corresponds to age group 3, 18-49 yr
+        #     "rate_age_3" corresponds to age group 4, 50-64 yr
+        #     "rate_age_4" corresponds to age group 5, 65+ yr
+        #     "rate_age_5" corresponds to age group 7, 65-74 yr
+        #     "rate_age_6" corresponds to age group 8, 75-84 yr
+        #     "rate_age_7" corresponds to age group 9, 85+ yr
+        #
+        #  Group 6 was the "overall" category and not included in the
+        #  ordinal naming scheme. Because of that, groups 1-5 have column
+        #  ids equal to the ageid - 1; groups 7-9 have column ids equal
+        #  to ageid - 2.
+        #
+        #  Automatically map from ageids 1-9 to column ids to match
+        #  the historical convention.
+        if ageid <= 5:
+            age_group = str(ageid - 1)
+        elif ageid == 6:
+            # Ageid of 6 used to be used for the "overall" category.
+            #  Now "overall" is represented by a valueid of 0, and ageid of 6
+            #  is not used for any group. If we see an ageid of 6, something
+            #  has gone wrong.
+            raise ValueError("Ageid cannot be 6; please check for changes in the API")
+        elif ageid <= 9:
+            age_group = str(ageid - 2)
+        else:
+            age_group = id_label_map["Age"][ageid]
+        group = "age_" + age_group
+    elif sexid != 0:
+        group = "sex_" + id_label_map["Sex"][sexid]
+    elif raceid != 0:
+        group = "race_" + id_label_map["Race"][raceid]
+
+    return "rate_" + group