Merge pull request #1481 from cmu-delphi/release/indicators_v0.2.22_u…

…tils_v0.2.10 Release covidcast-indicators 0.2.22
cmu-delphi · Jan 20, 2022 · 6abcab3 · 6abcab3
2 parents b685b2e + 3af5cdc
commit 6abcab3
Show file tree

Hide file tree

Showing 32 changed files with 1,328 additions and 32 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.21
+current_version = 0.2.22
 commit = True
 message = chore: bump covidcast-indicators to {new_version}
 tag = False
diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
@@ -16,7 +16,7 @@ jobs:
     if: github.event.pull_request.draft == false
     strategy:
       matrix:
-        packages: [_delphi_utils_python, changehc, claims_hosp, combo_cases_and_deaths, doctor_visits, google_symptoms, hhs_hosp, hhs_facilities, jhu, nchs_mortality, nowcast, quidel, quidel_covidtest, safegraph_patterns, sir_complainsalot, usafacts]
+        packages: [_delphi_utils_python, changehc, claims_hosp, combo_cases_and_deaths, doctor_visits, dsew_community_profile, google_symptoms, hhs_hosp, hhs_facilities, jhu, nchs_mortality, nowcast, quidel, quidel_covidtest, safegraph_patterns, sir_complainsalot, usafacts]
     defaults:
       run:
         working-directory: ${{ matrix.packages }}

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -9,7 +9,7 @@
    - Keep in sync with '.github/workflows/python-ci.yml'.
    - TODO: #527 Get this list automatically from python-ci.yml at runtime.
  */
-def indicator_list = ["changehc", "claims_hosp", "facebook", "google_symptoms", "hhs_hosp", "jhu", "nchs_mortality", "quidel", "quidel_covidtest", "safegraph_patterns", "sir_complainsalot", "usafacts"]
+def indicator_list = ["changehc", "claims_hosp", "facebook", "google_symptoms", "hhs_hosp", "jhu", "nchs_mortality", "quidel", "quidel_covidtest", "safegraph_patterns", "sir_complainsalot", "usafacts", "dsew_community_profile"]
 def build_package = [:]
 def deploy_staging = [:]
 def deploy_production = [:]

diff --git a/_delphi_utils_python/.bumpversion.cfg b/_delphi_utils_python/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.9
+current_version = 0.2.10
 commit = True
 message = chore: bump delphi_utils to {new_version}
 tag = False

diff --git a/_delphi_utils_python/delphi_utils/__init__.py b/_delphi_utils_python/delphi_utils/__init__.py
@@ -15,4 +15,4 @@
 from .nancodes import Nans
 from .weekday import Weekday
 
-__version__ = "0.2.9"
+__version__ = "0.2.10"
diff --git a/_delphi_utils_python/delphi_utils/validator/datafetcher.py b/_delphi_utils_python/delphi_utils/validator/datafetcher.py
@@ -111,14 +111,9 @@ def get_geo_signal_combos(data_source):
     Cross references based on combinations reported available by COVIDcast metadata.
     """
     # Maps data_source name with what's in the API, lists used in case of multiple names
-    # pylint: disable=fixme
-    # TODO: Extract this mapping from meta response instead of hard-coding
-    # https://github.com/cmu-delphi/covidcast-indicators/issues/1457
-    source_signal_mappings = {
-        'indicator-combination': ['indicator-combination-cases-deaths'],
-        'quidel': ['quidel-covid-ag'],
-        'safegraph': ['safegraph-weekly']
-    }
+
+    source_signal_mappings = {i['source']:i['db_source'] for i in
+        requests.get("https://api.covidcast.cmu.edu/epidata/covidcast/meta").json()}
     meta = covidcast.metadata()
     source_meta = meta[meta['data_source'] == data_source]
     # Need to convert np.records to tuples so they are hashable and can be used in sets and dicts.
@@ -130,8 +125,9 @@ def get_geo_signal_combos(data_source):
     # True/False indicate if status is active, "unknown" means we should check
     sig_combo_seen = dict()
     for combo in geo_signal_combos:
-        if source_signal_mappings.get(data_source):
-            src_list = source_signal_mappings.get(data_source)
+        if data_source in source_signal_mappings.values():
+            src_list = [key for (key, value) in source_signal_mappings.items()
+                if value == data_source]
         else:
             src_list = [data_source]
         for src in src_list:

diff --git a/_delphi_utils_python/setup.py b/_delphi_utils_python/setup.py
@@ -26,7 +26,7 @@
 
 setup(
     name="delphi_utils",
-    version="0.2.9",
+    version="0.2.10",
     description="Shared Utility Functions for Indicators",
     long_description=long_description,
     long_description_content_type="text/markdown",

diff --git a/_delphi_utils_python/tests/validator/test_datafetcher.py b/_delphi_utils_python/tests/validator/test_datafetcher.py
@@ -21,25 +21,44 @@ def test_make_date_filter(self):
         assert not date_filter(FILENAME_REGEX.match("20200620_a_b.csv"))
         assert not date_filter(FILENAME_REGEX.match("202006_a_b.csv"))
 
-    # pylint: disable=fixme
-    # TODO: mock out the advanced meta endpoint /covidcast/meta as well
-    # https://github.com/cmu-delphi/covidcast-indicators/issues/1456
+    # Solution from https://stackoverflow.com/questions/15753390/
+    #how-can-i-mock-requests-and-the-response
+    def mocked_requests_get(*args, **kwargs):
+        class MockResponse:
+            def __init__(self, json_data, status_code):
+                self.json_data = json_data
+                self.status_code = status_code
+
+            def json(self):
+                return self.json_data
+        if len(kwargs) == 0:
+            return MockResponse([{'source': 'chng', 'db_source': 'chng'},
+                {'source': 'covid-act-now', 'db_source': 'covid-act-now'}], 200)
+        elif kwargs["params"] == {'signal': 'chng:inactive'}:
+            return MockResponse([{"signals": [{"active": False}]}], 200)
+        else:
+            return MockResponse([{"signals": [{"active": True}]}], 200)
+    @mock.patch('requests.get', side_effect=mocked_requests_get)
     @mock.patch("covidcast.metadata")
-    def test_get_geo_signal_combos(self, mock_metadata):
+    def test_get_geo_signal_combos(self, mock_metadata, mock_get):
         """Test that the geo signal combos are correctly pulled from the covidcast metadata."""
         # Need to use actual data_source and signal names since we reference the API
+        # We let the chng signal "inactive" be an inactive signal
         mock_metadata.return_value = pd.DataFrame({"data_source": ["chng", "chng", "chng",
                                                                    "covid-act-now",
                                                                    "covid-act-now",
-                                                                   "covid-act-now"],
+                                                                   "covid-act-now",
+                                                                   "chng"],
                                                    "signal": ["smoothed_outpatient_cli",
                                                               "smoothed_outpatient_covid",
                                                               "smoothed_outpatient_covid",
                                                               "pcr_specimen_positivity_rate",
                                                               "pcr_specimen_positivity_rate",
-                                                              "pcr_specimen_total_tests"],
+                                                              "pcr_specimen_total_tests",
+                                                              "inactive"],
                                                    "geo_type": ["state", "state", "county",
-                                                                "hrr", "msa", "msa"]
+                                                                "hrr", "msa", "msa",
+                                                                "state"]
                                                   })
 
         assert set(get_geo_signal_combos("chng")) == set(

diff --git a/ansible/templates/dsew_community_profile-params-prod.json.j2 b/ansible/templates/dsew_community_profile-params-prod.json.j2
@@ -0,0 +1,32 @@
+{
+  "common": {
+    "export_dir": "/common/covidcast/receiving/dsew-cpr",
+    "log_filename": "/var/log/indicators/dsew_cpr.log"
+  },
+  "indicator": {
+    "input_cache": "./input_cache",
+    "reports": "new"
+  },
+  "validation": {
+    "common": {
+      "data_source": "dsew-cpr",
+      "span_length": 14,
+      "min_expected_lag": {"all": "5"},
+      "max_expected_lag": {"all": "9"},
+      "dry_run": true,
+      "suppressed_errors": []
+    },
+    "static": {
+      "minimum_sample_size": 0,
+      "missing_se_allowed": true,
+      "missing_sample_size_allowed": true
+    },
+    "dynamic": {
+      "ref_window_size": 7,
+      "smoothed_signals": [
+        "naats_total_7dav",
+        "naats_positivity_7dav"
+      ]
+    }
+  }
+}
diff --git a/ansible/templates/facebook-params-prod.json.j2 b/ansible/templates/facebook-params-prod.json.j2
@@ -38,6 +38,7 @@
         "Survey of COVID-Like Illness - Wave 11": "fb-survey",
         "Survey of COVID-Like Illness - Wave 12": "fb-survey",
         "Survey of COVID-Like Illness - Wave 12 - Full Launch": "fb-survey",
+        "Survey of COVID-Like Illness - Wave 13": "fb-survey",
         "Survey of COVID-Like Illness - Wave 4": "fb-survey",
         "Survey of COVID-Like Illness - Wave 5": "fb-survey",
         "Survey of COVID-Like Illness - Wave 6": "fb-survey",

diff --git a/ansible/templates/sir_complainsalot-params-prod.json.j2 b/ansible/templates/sir_complainsalot-params-prod.json.j2
@@ -90,7 +90,40 @@
         ["smoothed_dontneed_reason_not_high_risk", "hrr"], ["smoothed_wdontneed_reason_not_high_risk", "hrr"],
         ["smoothed_dontneed_reason_not_serious", "hrr"], ["smoothed_wdontneed_reason_not_serious", "hrr"],
         ["smoothed_dontneed_reason_other", "hrr"], ["smoothed_wdontneed_reason_other", "hrr"],
-        ["smoothed_dontneed_reason_precautions", "hrr"], ["smoothed_wdontneed_reason_precautions", "hrr"]
+        ["smoothed_dontneed_reason_precautions", "hrr"], ["smoothed_wdontneed_reason_precautions", "hrr"],
+        "smoothed_screening_tested_positive_14d", "smoothed_wscreening_tested_positive_14d",
+        "smoothed_travel_outside_state_7d", "smoothed_wtravel_outside_state_7d",
+        "smoothed_belief_vaccinated_mask_unnecessary", "smoothed_wbelief_vaccinated_mask_unnecessary",
+        "smoothed_belief_children_immune", "smoothed_wbelief_children_immune",
+        "smoothed_received_2_vaccine_doses", "smoothed_wreceived_2_vaccine_doses",
+        "smoothed_vaccine_barrier_eligible", "smoothed_wvaccine_barrier_eligible",
+        "smoothed_vaccine_barrier_no_appointments", "smoothed_wvaccine_barrier_no_appointments",
+        "smoothed_vaccine_barrier_appointment_time", "smoothed_wvaccine_barrier_appointment_time",
+        "smoothed_vaccine_barrier_technical_difficulties", "smoothed_wvaccine_barrier_technical_difficulties",
+        "smoothed_vaccine_barrier_document", "smoothed_wvaccine_barrier_document",
+        "smoothed_vaccine_barrier_technology_access", "smoothed_wvaccine_barrier_technology_access",
+        "smoothed_vaccine_barrier_travel", "smoothed_wvaccine_barrier_travel",
+        "smoothed_vaccine_barrier_language", "smoothed_wvaccine_barrier_language",
+        "smoothed_vaccine_barrier_childcare", "smoothed_wvaccine_barrier_childcare",
+        "smoothed_vaccine_barrier_time", "smoothed_wvaccine_barrier_time",
+        "smoothed_vaccine_barrier_type", "smoothed_wvaccine_barrier_type",
+        "smoothed_vaccine_barrier_none", "smoothed_wvaccine_barrier_none",
+        "smoothed_vaccine_barrier_appointment_location", "smoothed_wvaccine_barrier_appointment_location",
+        "smoothed_vaccine_barrier_other", "smoothed_wvaccine_barrier_other",
+        "smoothed_vaccine_barrier_eligible_has", "smoothed_wvaccine_barrier_eligible_has",
+        "smoothed_vaccine_barrier_no_appointments_has", "smoothed_wvaccine_barrier_no_appointments_has",
+        "smoothed_vaccine_barrier_appointment_time_has", "smoothed_wvaccine_barrier_appointment_time_has",
+        "smoothed_vaccine_barrier_technical_difficulties_has", "smoothed_wvaccine_barrier_technical_difficulties_has",
+        "smoothed_vaccine_barrier_document_has", "smoothed_wvaccine_barrier_document_has",
+        "smoothed_vaccine_barrier_technology_access_has", "smoothed_wvaccine_barrier_technology_access_has",
+        "smoothed_vaccine_barrier_travel_has", "smoothed_wvaccine_barrier_travel_has",
+        "smoothed_vaccine_barrier_language_has", "smoothed_wvaccine_barrier_language_has",
+        "smoothed_vaccine_barrier_childcare_has", "smoothed_wvaccine_barrier_childcare_has",
+        "smoothed_vaccine_barrier_time_has", "smoothed_wvaccine_barrier_time_has",
+        "smoothed_vaccine_barrier_type_has", "smoothed_wvaccine_barrier_type_has",
+        "smoothed_vaccine_barrier_none_has", "smoothed_wvaccine_barrier_none_has",
+        "smoothed_vaccine_barrier_appointment_location_has", "smoothed_wvaccine_barrier_appointment_location_has",
+        "smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has"
       ]
     },
     "quidel": {

diff --git a/dsew_community_profile/.pylintrc b/dsew_community_profile/.pylintrc
@@ -0,0 +1,22 @@
+
+[MESSAGES CONTROL]
+
+disable=logging-format-interpolation,
+    too-many-locals,
+    too-many-arguments,
+    # Allow pytest functions to be part of a class.
+    no-self-use,
+    # Allow pytest classes to have one test.
+    too-few-public-methods
+
+[BASIC]
+
+# Allow arbitrarily short-named variables.
+variable-rgx=[a-z_][a-z0-9_]*
+argument-rgx=[a-z_][a-z0-9_]*
+attr-rgx=[a-z_][a-z0-9_]*
+
+[DESIGN]
+
+# Don't complain about pytest "unused" arguments.
+ignored-argument-names=(_.*|run_as_module)
diff --git a/dsew_community_profile/DETAILS.md b/dsew_community_profile/DETAILS.md
@@ -0,0 +1,133 @@
+# Dataset layout
+
+The Data Strategy and Execution Workgroup (DSEW) publishes a Community Profile
+Report each weekday, comprising a pair of files: an Excel workbook (.xlsx) and a
+PDF which shows select metrics from the workbook as time series charts and
+choropleth maps. These files are listed as attachments on the healthdata.gov
+site:
+
+https://healthdata.gov/Health/COVID-19-Community-Profile-Report/gqxm-d9w9
+
+Each Excel file attachment has a filename. The filename contains a date,
+presumably the publish date. The attachment also has an alphanumeric
+assetId. Both the filename and the assetId are required for downloading the
+file. Whether this means that updated versions of a particular file may be
+uploaded by DSEW at later times is not known. The attachment does not explicitly
+list an upload timestamp. To be safe, we cache our downloads using both the
+assetId and the filename.
+
+# Workbook layout
+
+Each Excel file is a workbook with multiple sheets. The exemplar file used in
+writing this indicator is "Community Profile Report 20211102.xlsx". The sheets
+include:
+
+- User Notes: Instructions for using the workbook
+- Overview: US National figures for the last 5 weeks, plus monthly peaks back to
+  April 2020
+- Regions*: Figures for FEMA regions (double-checked: they match HHS regions
+  except that FEMA 2 does not include Palau while HHS 2 does)
+- States*: Figures for US states and territories
+- CBSAs*: Figures for US Core-Based Statistical Areas
+- Counties*: Figures for US counties
+- Weekly Transmission Categories: Lists of high, substantial, and moderate
+  transmission states and territories
+- National Peaks: Monthly national peaks back to April 2020
+- National Historic: Daily national figures back to January 22 2020
+- Data Notes: Source and methods information for all metrics
+- Color Thresholds: Color-coding is used extensively in all sheets; these are
+  the keys
+
+The starred sheets above have nearly-identical column layouts, and together
+cover the county, MSA, state, and HHS geographical levels used in
+covidcast. Rather than aggregate them ourselves and risk a mismatch, this
+indicator lifts these geographical aggregations directly from the corresponding
+sheets of the workbook. 
+
+GeoMapper _is_ used to generate national figures from state-level figures,
+due to architectural differences between the starred sheets and the Overview
+sheet. If we discover that our nation-level figures differ too much from those
+listed in the Overview sheet, we can add dedicated parsing for the Overview
+sheet and remove GeoMapper from this indicator altogether.
+
+# Sheet layout
+
+## Headers
+
+Each starred sheet has two rows of headers. The first row uses merged cells to
+group several columns together under a single "overheader". This overheader
+often includes the reference period for that group of columns, such as:
+
+- CASES/DEATHS: LAST WEEK (October 26-November 1)
+- TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)
+- TESTING: PREVIOUS WEEK (October 17-23, Test Volume October 13-19)
+
+Overheaders have changed periodically since the first report. For example, the
+"TESTING: LAST WEEK" overheader above has also appeared as "VIRAL (RT-PCR) LAB
+TESTING: LAST WEEK", with and without a separate reference date for Test
+Volume. All known overheader forms are checked in test_pull.py.
+
+The second row contains a header for each column. The headers uniquely identify
+each column included in the sheet. Column headers include spaces, and typically
+specify both the metric and the reference period over which it was calculated,
+such as:
+
+- Total NAATs - last 7 days (may be an underestimate due to delayed reporting)
+- NAAT positivity rate - previous 7 days (may be an underestimate due to delayed
+  reporting)
+
+Column headers have also changed periodically since the first report. For
+example, the "Total NAATs - last 7 days" header above has also appeared as
+"Total RT-PCR diagnostic tests - last 7 days".
+
+## Contents
+
+Each starred sheet contains test positivity and total test volume figures for
+two reference periods, "last [week]" and "previous [week]". In some reports, the
+reference periods for test positivity and total test volume are the same; in
+others, they are different, such that the report contains figures for four
+distinct reference periods, two for each metric we extract.
+
+# Time series conversions and parsing notes
+
+## Reference date
+
+The reference period in the overheader never includes the year. We guess the
+reference year by starting from the year of the publish date (i.e., the date
+extracted from the filename) and, if the reference month is later than the
+publish month, subtracting 1. This adequately covers the December-January
+boundary.
+
+We select as reference date the end date of the reference period for each
+metric. Reference periods are always 7 days, so this indicator produces
+seven-day averages. We divide the total testing volume by seven and leave the
+test positivity alone.
+
+## Geo ID
+
+The Counties sheet lists FIPS codes numerically, such that FIPS with a leading
+zero only have four digits. We fix this by zero-filling to five characters.
+
+MSAs are a subset of CBSAs. We fix this by selecting only CBSAs with type
+"Metropolitan".
+
+Most of the starred sheets have the geo id as the first non-index column. The
+Region sheet has no such column. We fix this by generating the HHS ids from the
+index column instead.
+
+## Combining multiple reports
+
+Each report file generates two reference dates for each metric, up to four
+reference dates total. Since it's not clear whether new versions of past files
+are ever made available, the default mode (params.indicator.reports="new")
+fetches any files that are not already in the input cache, then combines the
+results into a single data frame before exporting. This produces correct
+behavior if, for instance, a previously downloaded file gets a new assetId.
+
+For the initial run on an empty input cache, and for runs configured to process
+a range of reports (using params.indicator.reports=YYYY-mm-dd--YYYY-mm-dd), this
+indicator makes no distinction between figures that came from different
+reports. That may not be what you want. If the covidcast issue date needs to
+match the date on the report filename, then the indicator must instead be run
+repeatedly, with equal start and end dates, keeping the output of each run
+separate.