From 431d8c1b567b9472b1e61e7f14fc0655ab54cabc Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Thu, 11 Nov 2021 14:05:35 -0500 Subject: [PATCH 01/32] Initial add of community profile indicator --- dsew_community_profile/.pylintrc | 22 ++ dsew_community_profile/Makefile | 24 ++ dsew_community_profile/README.md | 62 +++++ dsew_community_profile/REVIEW.md | 38 +++ dsew_community_profile/cache/.gitignore | 0 .../delphi_dsew_community_profile/__init__.py | 13 + .../delphi_dsew_community_profile/__main__.py | 12 + .../constants.py | 56 +++++ .../delphi_dsew_community_profile/pull.py | 231 ++++++++++++++++++ .../delphi_dsew_community_profile/run.py | 63 +++++ dsew_community_profile/input_cache/.gitignore | 0 dsew_community_profile/params.json.template | 25 ++ dsew_community_profile/receiving/.gitignore | 0 dsew_community_profile/setup.py | 29 +++ dsew_community_profile/static/.gitignore | 0 .../tests/params.json.template | 25 ++ dsew_community_profile/tests/test_pull.py | 71 ++++++ 17 files changed, 671 insertions(+) create mode 100644 dsew_community_profile/.pylintrc create mode 100644 dsew_community_profile/Makefile create mode 100644 dsew_community_profile/README.md create mode 100644 dsew_community_profile/REVIEW.md create mode 100644 dsew_community_profile/cache/.gitignore create mode 100644 dsew_community_profile/delphi_dsew_community_profile/__init__.py create mode 100644 dsew_community_profile/delphi_dsew_community_profile/__main__.py create mode 100644 dsew_community_profile/delphi_dsew_community_profile/constants.py create mode 100644 dsew_community_profile/delphi_dsew_community_profile/pull.py create mode 100644 dsew_community_profile/delphi_dsew_community_profile/run.py create mode 100644 dsew_community_profile/input_cache/.gitignore create mode 100644 dsew_community_profile/params.json.template create mode 100644 dsew_community_profile/receiving/.gitignore create mode 100644 dsew_community_profile/setup.py create mode 100644 dsew_community_profile/static/.gitignore create mode 100644 dsew_community_profile/tests/params.json.template create mode 100644 dsew_community_profile/tests/test_pull.py diff --git a/dsew_community_profile/.pylintrc b/dsew_community_profile/.pylintrc new file mode 100644 index 000000000..f30837c7e --- /dev/null +++ b/dsew_community_profile/.pylintrc @@ -0,0 +1,22 @@ + +[MESSAGES CONTROL] + +disable=logging-format-interpolation, + too-many-locals, + too-many-arguments, + # Allow pytest functions to be part of a class. + no-self-use, + # Allow pytest classes to have one test. + too-few-public-methods + +[BASIC] + +# Allow arbitrarily short-named variables. +variable-rgx=[a-z_][a-z0-9_]* +argument-rgx=[a-z_][a-z0-9_]* +attr-rgx=[a-z_][a-z0-9_]* + +[DESIGN] + +# Don't complain about pytest "unused" arguments. +ignored-argument-names=(_.*|run_as_module) \ No newline at end of file diff --git a/dsew_community_profile/Makefile b/dsew_community_profile/Makefile new file mode 100644 index 000000000..72814c37b --- /dev/null +++ b/dsew_community_profile/Makefile @@ -0,0 +1,24 @@ +.PHONY = venv, lint, test, clean + +dir = $(shell find ./delphi_* -name __init__.py | grep -o 'delphi_[_[:alnum:]]*') + +venv: + python3.8 -m venv env + +install: venv + . env/bin/activate; \ + pip install wheel ; \ + pip install -e ../_delphi_utils_python ;\ + pip install -e . + +lint: + . env/bin/activate; pylint $(dir) + . env/bin/activate; pydocstyle $(dir) + +test: + . env/bin/activate ;\ + (cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing) + +clean: + rm -rf env + rm -f params.json diff --git a/dsew_community_profile/README.md b/dsew_community_profile/README.md new file mode 100644 index 000000000..e4f3e64e4 --- /dev/null +++ b/dsew_community_profile/README.md @@ -0,0 +1,62 @@ +# COVID-19 Community Profile Report + + + +## Running the Indicator + +The indicator is run by directly executing the Python module contained in this +directory. The safest way to do this is to create a virtual environment, +installed the common DELPHI tools, and then install the module and its +dependencies. To do this, run the following command from this directory: + +``` +make install +``` + +This command will install the package in editable mode, so you can make changes that +will automatically propagate to the installed package. + +All of the user-changable parameters are stored in `params.json`. To execute +the module and produce the output datasets (by default, in `receiving`), run +the following: + +``` +env/bin/python -m delphi_dsew_community_profile +``` + +If you want to enter the virtual environment in your shell, +you can run `source env/bin/activate`. Run `deactivate` to leave the virtual environment. + +Once you are finished, you can remove the virtual environment and +params file with the following: + +``` +make clean +``` + +## Testing the code + +To run static tests of the code style, run the following command: + +``` +make lint +``` + +Unit tests are also included in the module. To execute these, run the following +command from this directory: + +``` +make test +``` + +To run individual tests, run the following: + +``` +(cd tests && ../env/bin/pytest .py --cov=delphi_dsew_community_profile --cov-report=term-missing) +``` + +The output will show the number of unit tests that passed and failed, along +with the percentage of code covered by the tests. + +None of the linting or unit tests should fail, and the code lines that are not covered by unit tests should be small and +should not include critical sub-routines. diff --git a/dsew_community_profile/REVIEW.md b/dsew_community_profile/REVIEW.md new file mode 100644 index 000000000..03f87b17a --- /dev/null +++ b/dsew_community_profile/REVIEW.md @@ -0,0 +1,38 @@ +## Code Review (Python) + +A code review of this module should include a careful look at the code and the +output. To assist in the process, but certainly not in replace of it, please +check the following items. + +**Documentation** + +- [ ] the README.md file template is filled out and currently accurate; it is +possible to load and test the code using only the instructions given +- [ ] minimal docstrings (one line describing what the function does) are +included for all functions; full docstrings describing the inputs and expected +outputs should be given for non-trivial functions + +**Structure** + +- [ ] code should pass lint checks (`make lint`) +- [ ] any required metadata files are checked into the repository and placed +within the directory `static` +- [ ] any intermediate files that are created and stored by the module should +be placed in the directory `cache` +- [ ] final expected output files to be uploaded to the API are placed in the +`receiving` directory; output files should not be committed to the respository +- [ ] all options and API keys are passed through the file `params.json` +- [ ] template parameter file (`params.json.template`) is checked into the +code; no personal (i.e., usernames) or private (i.e., API keys) information is +included in this template file + +**Testing** + +- [ ] module can be installed in a new virtual environment (`make install`) +- [ ] reasonably high level of unit test coverage covering all of the main logic +of the code (e.g., missing coverage for raised errors that do not currently seem +possible to reach are okay; missing coverage for options that will be needed are +not) +- [ ] all unit tests run without errors (`make test`) +- [ ] indicator directory has been added to GitHub CI +(`covidcast-indicators/.github/workflows/python-ci.yml`) diff --git a/dsew_community_profile/cache/.gitignore b/dsew_community_profile/cache/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/dsew_community_profile/delphi_dsew_community_profile/__init__.py b/dsew_community_profile/delphi_dsew_community_profile/__init__.py new file mode 100644 index 000000000..52a507259 --- /dev/null +++ b/dsew_community_profile/delphi_dsew_community_profile/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +"""Module to pull and clean indicators from the XXXXX source. + +This file defines the functions that are made public by the module. As the +module is intended to be executed though the main method, these are primarily +for testing. +""" + +from __future__ import absolute_import + +from . import run + +__version__ = "0.1.0" diff --git a/dsew_community_profile/delphi_dsew_community_profile/__main__.py b/dsew_community_profile/delphi_dsew_community_profile/__main__.py new file mode 100644 index 000000000..ab5a749dc --- /dev/null +++ b/dsew_community_profile/delphi_dsew_community_profile/__main__.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +"""Call the function run_module when executed. + +This file indicates that calling the module (`python -m delphi_dsew_community_profile`) will +call the function `run_module` found within the run.py file. There should be +no need to change this template. +""" + +from delphi_utils import read_params +from .run import run_module # pragma: no cover + +run_module(read_params()) # pragma: no cover diff --git a/dsew_community_profile/delphi_dsew_community_profile/constants.py b/dsew_community_profile/delphi_dsew_community_profile/constants.py new file mode 100644 index 000000000..5339dd081 --- /dev/null +++ b/dsew_community_profile/delphi_dsew_community_profile/constants.py @@ -0,0 +1,56 @@ +"""Registry for variations.""" +from collections.abc import Callable as function +from dataclasses import dataclass + +URL_PREFIX = "https://healthdata.gov/api/views/gqxm-d9w9" +DOWNLOAD_ATTACHMENT = URL_PREFIX + "/files/{assetId}?download=true&filename={filename}" +DOWNLOAD_LISTING = URL_PREFIX + ".json" + +@dataclass +class Transform: + """Transformation filters for interpreting a particular sheet in the workbook.""" + + name: str = None + level: str = None + row_filter: function = None + geo_id_select: function = None + geo_id_apply: function = None + +T_FIRST = lambda df: df[df.columns[0]] +TRANSFORMS = { + t.name: t for t in [ + Transform( + name="Regions", + level="hhs", + geo_id_select=lambda df: df.index.to_series(), + geo_id_apply=lambda x: x.replace("Region ", "") + ), + Transform( + name="States", + level="state", + geo_id_select=T_FIRST, + geo_id_apply=lambda x: x.lower() + ), + Transform( + name="CBSAs", + level="msa", + row_filter=lambda df: df['CBSA type'] == "Metropolitan", + geo_id_select=T_FIRST, + geo_id_apply=lambda x: f"{x}" + ), + Transform( + name="Counties", + level="county", + geo_id_select=T_FIRST, + geo_id_apply=lambda x: f"{x:05}" + ) + ]} + +SIGNALS = [ + "total", + "positivity" +] + +def make_signal_name(key): + """Convert a signal key to the corresponding signal name for the API.""" + return f"naats_{key}_7dav" diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py new file mode 100644 index 000000000..49198c76b --- /dev/null +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -0,0 +1,231 @@ +# -*- coding: utf-8 -*- +"""Functions to call when downloading data.""" +from dataclasses import dataclass +import datetime +import os +import re +from urllib.parse import quote_plus as quote_as_url + +import pandas as pd +import requests + +from delphi_utils.geomap import GeoMapper + +from .constants import TRANSFORMS, SIGNALS +from .constants import DOWNLOAD_ATTACHMENT, DOWNLOAD_LISTING + +# YYYYMMDD +# example: "Community Profile Report 20211104.xlsx" +RE_DATE_FROM_FILENAME = re.compile(r'.*([0-9]{4})([0-9]{2})([0-9]{2}).*xlsx') + +# example: "TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)" +# example: "TESTING: PREVIOUS WEEK (October 17-23, Test Volume October 13-19)" +DATE_EXP = r'(?:(.*) )?([0-9]{1,2})' +DATE_RANGE_EXP = f"{DATE_EXP}-{DATE_EXP}" +RE_DATE_FROM_HEADER = re.compile( + rf'TESTING: (.*) WEEK \({DATE_RANGE_EXP}, Test Volume ({DATE_RANGE_EXP})\)' +) + +# example: "NAAT positivity rate - last 7 days (may be an underestimate due to delayed reporting)" +# example: "Total NAATs - last 7 days (may be an underestimate due to delayed reporting)" +RE_COLUMN_FROM_HEADER = re.compile('- (.*) 7 days') + +@dataclass +class DatasetTimes: + """Collect reference dates for a column.""" + + column: str + positivity_reference_date: datetime.date + total_reference_date: datetime.date + def __getitem__(self, key): + """Use DatasetTimes like a dictionary.""" + if key.lower()=="positivity": + return self.positivity_reference_date + if key.lower()=="total": + return self.total_reference_date + raise ValueError( + f"Bad reference date type request '{key}'; need 'total' or 'positivity'" + ) + def __eq__(self, other): + """Check equality by value.""" + return isinstance(other, DatasetTimes) and \ + other.column == self.column and \ + other.positivity_reference_date == self.positivity_reference_date and \ + other.total_reference_date == self.total_reference_date + +def as_reference_date(header, year=2021): + """Convert reference dates in overheader to DatasetTimes.""" + findall_result = RE_DATE_FROM_HEADER.findall(header)[0] + def as_date(sub_result): + month = sub_result[2] if sub_result[2] else sub_result[0] + day = sub_result[3] + return datetime.datetime.strptime(f"{year}-{month}-{day}", "%Y-%B-%d").date() + column = findall_result[0].lower() + return DatasetTimes(column, as_date(findall_result[1:5]), as_date(findall_result[6:10])) + +class Dataset: + """All data extracted from a single report file.""" + + def __init__(self, config, sheets=TRANSFORMS.keys(), logger=None): + """Create a new Dataset instance. + + Download and cache the requested report file. + + Parse the file into data frames at multiple geo levels. + """ + self.publish_date = datetime.date( + *[int(x) for x in RE_DATE_FROM_FILENAME.findall(config['filename'])[0]] + ) + + self.url = DOWNLOAD_ATTACHMENT.format( + asset_id=config['assetId'], + filename=quote_as_url(config['filename']) + ) + if logger: + logger.info("Downloading file", filename=config['cached_filename']) + resp = requests.get(self.url) + with open(config['cached_filename'], 'wb') as f: + f.write(resp.content) + + self.workbook = pd.ExcelFile(config['cached_filename']) + + self.dfs = {} + self.times = {} + for si in sheets: + assert si in TRANSFORMS, f"Bad sheet requested: {si}" + if logger: + logger.info("Building dfs", + sheet=f"{si}", + filename=config['cached_filename']) + sheet = TRANSFORMS[sheet] + self._parse_times_for_sheet(sheet) + self._parse_sheet(sheet) + + @staticmethod + def skip_overheader(header): + """Ignore irrelevant overheaders.""" + # include "TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)" + # include "TESTING: PREVIOUS WEEK (October 17-23, Test Volume October 13-19)" + return not (isinstance(header, str) and header.startswith("TESTING:") \ + # exclude "TESTING: % CHANGE FROM PREVIOUS WEEK" \ + # exclude "TESTING: DEMOGRAPHIC DATA" \ + and header.find("WEEK (") > 0) + def _parse_times_for_sheet(self, sheet): + """Record reference dates for this sheet.""" + # grab reference dates from overheaders + for h in pd.read_excel( + self.workbook, sheet_name=sheet.name, + header=None, + nrows=1 + ).values.flatten().tolist(): + if self.skip_overheader(h): + continue + + dt = as_reference_date(h) + if dt.column in self.times: + assert self.times[dt.column] == dt, \ + f"Conflicting reference date from {sheet.name} {dt}" + \ + f"vs previous {self.times[dt.column]}" + else: + self.times[dt.column] = dt + + @staticmethod + def retain_header(header): + """Ignore irrelevant headers.""" + return all([ + # include "Total NAATs - last 7 days ..." + # include "Total NAATs - previous 7 days ..." + # include "NAAT positivity rate - last 7 days ..." + # include "NAAT positivity rate - previous 7 days ..." + (header.startswith("Total NAATs") or header.startswith("NAAT positivity rate")), + # exclude "NAAT positivity rate - absolute change ..." + header.find("7 days") > 0, + # exclude "NAAT positivity rate - last 7 days - ages <5" + header.find(" ages") < 0, + ]) + def _parse_sheet(self, sheet): + """Extract data frame for this sheet.""" + df = pd.read_excel( + self.workbook, + sheet_name=sheet.name, + header=1, + index_col=0, + ) + if sheet.row_filter: + df = df.loc[sheet.row_filter(df)] + select = [ + (RE_COLUMN_FROM_HEADER.findall(h)[0], h, h.lower()) + for h in list(df.columns) + if self.retain_header(h) + ] + for sig in SIGNALS: + sig_select = [s for s in select if s[-1].find(sig) >= 0] + self.dfs[(sheet.level, sig)] = pd.concat([ + pd.DataFrame({ + "geo_id": sheet.geo_id_select(df).apply(sheet.geo_id_apply), + "timestamp": pd.to_datetime(self.times[si[0]][sig]), + "val": df[si[-2]], + "se": None, + "sample_size": None + }) + for si in sig_select + ]) + self.dfs[(sheet.level, "total")]["val"] /= 7 # 7-day total -> 7-day average + + +def as_cached_filename(params, config): + """Formulate a filename to uniquely identify this report in the input cache.""" + return os.path.join( + params['indicator']['input_cache'], + f"{config['assetId']}--{config['filename']}" + ) + +def fetch_listing(params): + """Generate the list of report files to process.""" + listing = requests.get(DOWNLOAD_LISTING).json()['metadata']['attachments'] + + # drop the pdf files + listing = [ + dict(el, cached_filename=as_cached_filename(params, el)) + for el in listing if el['filename'].endswith("xlsx") + ] + + # drop files we already have in the input cache + listing = [el for el in listing if os.path.exists(el['cached_filename'])] + return listing + +def download_and_parse(listing, logger): + """Convert a list of report files into Dataset instances.""" + datasets = {} + for item in listing: + d = Dataset(item, logger=logger) + for sig, df in d.dfs.items(): + if sig not in datasets: + datasets[sig] = [] + datasets[sig].append(df) + return datasets + +def fetch_new_reports(params, logger=None): + """Retrieve, compute, and collate all data we haven't seen yet.""" + listing = fetch_listing(params) + + # download and parse individual reports + datasets = download_and_parse(listing, logger) + + # collect like signals together + ret = {} + for sig, lst in datasets.items(): + ret[sig] = pd.concat(lst) + + # add nation from state + geomapper = GeoMapper() + for sig in SIGNALS: + df = geomapper.replace_geocode( + ret[("state", sig)].rename(columns={"geo_id":"state_code"}), + 'state_code', + 'nation', + new_col="geo_id" + ) + ret[("nation", sig)] = df + + return ret diff --git a/dsew_community_profile/delphi_dsew_community_profile/run.py b/dsew_community_profile/delphi_dsew_community_profile/run.py new file mode 100644 index 000000000..1856da215 --- /dev/null +++ b/dsew_community_profile/delphi_dsew_community_profile/run.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +"""Functions to call when running the indicator. + +This module should contain a function called `run_module`, that is executed when +the module is run with `python -m delphi_dsew_community_profile`. +`run_module`'s lone argument should be a nested dictionary of parameters loaded +from the params.json file. We expect the `params` to have the following +structure: + + - "common": + - "export_dir": str, directory to which the results are exported + - "log_filename": (optional) str, path to log file + - "indicator": (optional) + - Any other indicator-specific settings +""" +from datetime import datetime +import time + +from delphi_utils import get_structured_logger +from delphi_utils.export import create_export_csv + +from .constants import make_signal_name +from .pull import fetch_new_reports + + +def run_module(params): + """ + Run the indicator. + + Arguments + -------- + params: Dict[str, Any] + Nested dictionary of parameters. + """ + start_time = time.time() + logger = get_structured_logger( + __name__, filename=params["common"].get("log_filename"), + log_exceptions=params["common"].get("log_exceptions", True)) + + run_stats = [] + dfs = fetch_new_reports(params, logger) + for key, df in dfs.items(): + (geo, sig) = key + dates = create_export_csv( + df, + params['common']['export_dir'], + geo, + make_signal_name(sig) + ) + if len(dates)>0: + run_stats.append((max(dates), len(dates))) + + ## log this indicator run + elapsed_time_in_seconds = round(time.time() - start_time, 2) + min_max_date = run_stats and min(s[0] for s in run_stats) + csv_export_count = sum(s[-1] for s in run_stats) + max_lag_in_days = min_max_date and (datetime.now() - min_max_date).days + formatted_min_max_date = min_max_date and min_max_date.strftime("%Y-%m-%d") + logger.info("Completed indicator run", + elapsed_time_in_seconds = elapsed_time_in_seconds, + csv_export_count = csv_export_count, + max_lag_in_days = max_lag_in_days, + oldest_final_export_date = formatted_min_max_date) diff --git a/dsew_community_profile/input_cache/.gitignore b/dsew_community_profile/input_cache/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/dsew_community_profile/params.json.template b/dsew_community_profile/params.json.template new file mode 100644 index 000000000..dd61d4a93 --- /dev/null +++ b/dsew_community_profile/params.json.template @@ -0,0 +1,25 @@ +{ + "common": { + "export_dir": "./receiving", + "log_filename": "dsew_cpr.log" + }, + "indicator": { + "input_cache": "./input_cache" + }, + "validation": { + "common": { + "data_source": "dsew_cpr", + "span_length": 14, + "min_expected_lag": {"all": "5"}, + "max_expected_lag": {"all": "9"}, + "dry_run": true, + "suppressed_errors": [] + }, + "static": { + "minimum_sample_size": 0, + "missing_se_allowed": true, + "missing_sample_size_allowed": true + }, + "dynamic": {} + } +} diff --git a/dsew_community_profile/receiving/.gitignore b/dsew_community_profile/receiving/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/dsew_community_profile/setup.py b/dsew_community_profile/setup.py new file mode 100644 index 000000000..f9ba22053 --- /dev/null +++ b/dsew_community_profile/setup.py @@ -0,0 +1,29 @@ +from setuptools import setup +from setuptools import find_packages + +required = [ + "numpy", + "pandas", + "pydocstyle", + "pytest", + "pytest-cov", + "pylint==2.8.3", + "delphi-utils", + "covidcast" +] + +setup( + name="delphi_dsew_community_profile", + version="0.1.0", + description="Indicator tracking specimen test results published in the COVID-19 Community Profile Report by the Data Strategy and Execution Workgroup", + author="", + author_email="", + url="https://github.com/cmu-delphi/covidcast-indicators", + install_requires=required, + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3.8", + ], + packages=find_packages(), +) diff --git a/dsew_community_profile/static/.gitignore b/dsew_community_profile/static/.gitignore new file mode 100644 index 000000000..e69de29bb diff --git a/dsew_community_profile/tests/params.json.template b/dsew_community_profile/tests/params.json.template new file mode 100644 index 000000000..dd61d4a93 --- /dev/null +++ b/dsew_community_profile/tests/params.json.template @@ -0,0 +1,25 @@ +{ + "common": { + "export_dir": "./receiving", + "log_filename": "dsew_cpr.log" + }, + "indicator": { + "input_cache": "./input_cache" + }, + "validation": { + "common": { + "data_source": "dsew_cpr", + "span_length": 14, + "min_expected_lag": {"all": "5"}, + "max_expected_lag": {"all": "9"}, + "dry_run": true, + "suppressed_errors": [] + }, + "static": { + "minimum_sample_size": 0, + "missing_se_allowed": true, + "missing_sample_size_allowed": true + }, + "dynamic": {} + } +} diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py new file mode 100644 index 000000000..40812227e --- /dev/null +++ b/dsew_community_profile/tests/test_pull.py @@ -0,0 +1,71 @@ +from collections import namedtuple +from datetime import date +import pandas as pd +import pytest + +from delphi_dsew_community_profile.pull import DatasetTimes, as_reference_date +from delphi_dsew_community_profile.pull import Dataset + +example = namedtuple("example", "given expected") + +class TestPull: + def test_DatasetTimes(self): + examples = [ + example(DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20)), + DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20))), + ] + for ex in examples: + assert ex.given == ex.expected, "Equality" + + dt = DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20)) + assert dt["positivity"] == date(2021, 10, 30), "positivity" + assert dt["total"] == date(2021, 10, 20), "total" + with pytest.raises(ValueError): + dt["xyzzy"] + + def test_as_reference_date(self): + examples = [ + example("TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)", + DatasetTimes("last", date(2021, 10, 30), date(2021, 10, 26))), + example("TESTING: PREVIOUS WEEK (October 24-30, Test Volume October 20-26)", + DatasetTimes("previous", date(2021, 10, 30), date(2021, 10, 26))), + example("TESTING: LAST WEEK (October 24-November 30, Test Volume October 20-26)", + DatasetTimes("last", date(2021, 11, 30), date(2021, 10, 26))), + ] + for ex in examples: + assert as_reference_date(ex.given) == ex.expected, ex.given + + def test_Dataset_skip_overheader(self): + examples = [ + example("TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)", + False), + example("TESTING: PREVIOUS WEEK (October 17-23, Test Volume October 13-19)", + False), + example("TESTING: % CHANGE FROM PREVIOUS WEEK", + True), + example("TESTING: DEMOGRAPHIC DATA", + True) + ] + for ex in examples: + assert Dataset.skip_overheader(ex.given) == ex.expected, ex.given + def test_Dataset_retain_header(self): + examples = [ + example("Total NAATs - last 7 days (may be an underestimate due to delayed reporting)", + True), + example("Total NAATs - previous 7 days (may be an underestimate due to delayed reporting)", + True), + example("NAAT positivity rate - last 7 days (may be an underestimate due to delayed reporting)", + True), + example("NAAT positivity rate - previous 7 days (may be an underestimate due to delayed reporting)", + True), + example("NAAT positivity rate - absolute change (may be an underestimate due to delayed reporting)", + False), + example("NAAT positivity rate - last 7 days - ages <5", + False) + ] + for ex in examples: + assert Dataset.retain_header(ex.given) == ex.expected, ex.given + + def test_Dataset_parse_sheet(self): + # TODO + pass From ff3b7da1ea1d6ca64f8c3cc4381c23d51ed3dece Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Fri, 12 Nov 2021 13:31:58 -0500 Subject: [PATCH 02/32] Changes to support all known backissues --- .../constants.py | 2 + .../delphi_dsew_community_profile/pull.py | 118 ++++++++++++------ dsew_community_profile/input_cache/.gitignore | 1 + dsew_community_profile/params.json.template | 3 +- dsew_community_profile/setup.py | 1 + .../tests/params.json.template | 3 +- dsew_community_profile/tests/test_pull.py | 30 ++++- 7 files changed, 115 insertions(+), 43 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/constants.py b/dsew_community_profile/delphi_dsew_community_profile/constants.py index 5339dd081..07ceb7bc6 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/constants.py +++ b/dsew_community_profile/delphi_dsew_community_profile/constants.py @@ -54,3 +54,5 @@ class Transform: def make_signal_name(key): """Convert a signal key to the corresponding signal name for the API.""" return f"naats_{key}_7dav" + +NEWLINE="\n" diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 49198c76b..2c1ba5b3d 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -11,7 +11,7 @@ from delphi_utils.geomap import GeoMapper -from .constants import TRANSFORMS, SIGNALS +from .constants import TRANSFORMS, SIGNALS, NEWLINE from .constants import DOWNLOAD_ATTACHMENT, DOWNLOAD_LISTING # YYYYMMDD @@ -20,10 +20,10 @@ # example: "TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)" # example: "TESTING: PREVIOUS WEEK (October 17-23, Test Volume October 13-19)" -DATE_EXP = r'(?:(.*) )?([0-9]{1,2})' +DATE_EXP = r'(?:([A-Za-z]*) )?([0-9]{1,2})' DATE_RANGE_EXP = f"{DATE_EXP}-{DATE_EXP}" RE_DATE_FROM_HEADER = re.compile( - rf'TESTING: (.*) WEEK \({DATE_RANGE_EXP}, Test Volume ({DATE_RANGE_EXP})\)' + rf'.*TESTING: (.*) WEEK \({DATE_RANGE_EXP}(?:, Test Volume ({DATE_RANGE_EXP}))? *\)' ) # example: "NAAT positivity rate - last 7 days (may be an underestimate due to delayed reporting)" @@ -37,6 +37,33 @@ class DatasetTimes: column: str positivity_reference_date: datetime.date total_reference_date: datetime.date + + @staticmethod + def from_header(header, publish_date): + """Convert reference dates in overheader to DatasetTimes.""" + assert RE_DATE_FROM_HEADER.match(header), \ + f"Couldn't find reference date in header '{header}'" + findall_result = RE_DATE_FROM_HEADER.findall(header)[0] + def as_date(sub_result): + month = sub_result[2] if sub_result[2] else sub_result[0] + assert month, f"Bad month in header: {header}\nsub_result: {sub_result}" + month_numeric = datetime.datetime.strptime(month, "%B").month + day = sub_result[3] + year = publish_date.year + # year boundary + if month_numeric > publish_date.month: + year -= 1 + return datetime.datetime.strptime(f"{year}-{month}-{day}", "%Y-%B-%d").date() + + column = findall_result[0].lower() + positivity_reference_date = as_date(findall_result[1:5]) + if findall_result[6]: + # Reports published starting 2021-03-17 specify different reference + # dates for positivity and total test volume + total_reference_date = as_date(findall_result[6:10]) + else: + total_reference_date = positivity_reference_date + return DatasetTimes(column, positivity_reference_date, total_reference_date) def __getitem__(self, key): """Use DatasetTimes like a dictionary.""" if key.lower()=="positivity": @@ -53,16 +80,6 @@ def __eq__(self, other): other.positivity_reference_date == self.positivity_reference_date and \ other.total_reference_date == self.total_reference_date -def as_reference_date(header, year=2021): - """Convert reference dates in overheader to DatasetTimes.""" - findall_result = RE_DATE_FROM_HEADER.findall(header)[0] - def as_date(sub_result): - month = sub_result[2] if sub_result[2] else sub_result[0] - day = sub_result[3] - return datetime.datetime.strptime(f"{year}-{month}-{day}", "%Y-%B-%d").date() - column = findall_result[0].lower() - return DatasetTimes(column, as_date(findall_result[1:5]), as_date(findall_result[6:10])) - class Dataset: """All data extracted from a single report file.""" @@ -73,19 +90,17 @@ def __init__(self, config, sheets=TRANSFORMS.keys(), logger=None): Parse the file into data frames at multiple geo levels. """ - self.publish_date = datetime.date( - *[int(x) for x in RE_DATE_FROM_FILENAME.findall(config['filename'])[0]] - ) - + self.publish_date = self.parse_publish_date(config['filename']) self.url = DOWNLOAD_ATTACHMENT.format( - asset_id=config['assetId'], + assetId=config['assetId'], filename=quote_as_url(config['filename']) ) - if logger: - logger.info("Downloading file", filename=config['cached_filename']) - resp = requests.get(self.url) - with open(config['cached_filename'], 'wb') as f: - f.write(resp.content) + if not os.path.exists(config['cached_filename']): + if logger: + logger.info("Downloading file", filename=config['cached_filename']) + resp = requests.get(self.url) + with open(config['cached_filename'], 'wb') as f: + f.write(resp.content) self.workbook = pd.ExcelFile(config['cached_filename']) @@ -97,47 +112,61 @@ def __init__(self, config, sheets=TRANSFORMS.keys(), logger=None): logger.info("Building dfs", sheet=f"{si}", filename=config['cached_filename']) - sheet = TRANSFORMS[sheet] + sheet = TRANSFORMS[si] self._parse_times_for_sheet(sheet) self._parse_sheet(sheet) @staticmethod + def parse_publish_date(report_filename): + """Extract publish date from filename.""" + return datetime.date( + *[int(x) for x in RE_DATE_FROM_FILENAME.findall(report_filename)[0]] + ) + @staticmethod def skip_overheader(header): """Ignore irrelevant overheaders.""" - # include "TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)" - # include "TESTING: PREVIOUS WEEK (October 17-23, Test Volume October 13-19)" - return not (isinstance(header, str) and header.startswith("TESTING:") \ + # include "TESTING: [LAST|PREVIOUS] WEEK (October 24-30, Test Volume October 20-26)" + # include "VIRAL (RT-PCR) LAB TESTING: [LAST|PREVIOUS] WEEK (August 24-30, ..." + return not (isinstance(header, str) and \ + (header.startswith("TESTING:") or \ + header.startswith("VIRAL (RT-PCR) LAB TESTING:")) and \ # exclude "TESTING: % CHANGE FROM PREVIOUS WEEK" \ # exclude "TESTING: DEMOGRAPHIC DATA" \ - and header.find("WEEK (") > 0) + header.find("WEEK (") > 0) def _parse_times_for_sheet(self, sheet): """Record reference dates for this sheet.""" # grab reference dates from overheaders - for h in pd.read_excel( + overheaders = pd.read_excel( self.workbook, sheet_name=sheet.name, header=None, nrows=1 - ).values.flatten().tolist(): + ).values.flatten().tolist() + for h in overheaders: if self.skip_overheader(h): continue - dt = as_reference_date(h) + dt = DatasetTimes.from_header(h, self.publish_date) if dt.column in self.times: assert self.times[dt.column] == dt, \ f"Conflicting reference date from {sheet.name} {dt}" + \ f"vs previous {self.times[dt.column]}" else: self.times[dt.column] = dt + assert len(self.times) == 2, \ + f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" @staticmethod def retain_header(header): """Ignore irrelevant headers.""" return all([ - # include "Total NAATs - last 7 days ..." - # include "Total NAATs - previous 7 days ..." - # include "NAAT positivity rate - last 7 days ..." - # include "NAAT positivity rate - previous 7 days ..." - (header.startswith("Total NAATs") or header.startswith("NAAT positivity rate")), + # include "Total NAATs - [last|previous] 7 days ..." + # include "Total RT-PCR diagnostic tests - [last|previous] 7 days ..." + # include "NAAT positivity rate - [last|previous] 7 days ..." + # include "Viral (RT-PCR) lab test positivity rate - [last|previous] 7 days ..." + (header.startswith("Total NAATs") or + header.startswith("NAAT positivity rate") or + header.startswith("Total RT-PCR") or + header.startswith("Viral (RT-PCR)")), # exclude "NAAT positivity rate - absolute change ..." header.find("7 days") > 0, # exclude "NAAT positivity rate - last 7 days - ages <5" @@ -158,8 +187,11 @@ def _parse_sheet(self, sheet): for h in list(df.columns) if self.retain_header(h) ] + for sig in SIGNALS: sig_select = [s for s in select if s[-1].find(sig) >= 0] + assert len(sig_select) > 0, \ + f"No {sig} in any of {select}\n\nAll headers:\n{NEWLINE.join(list(df.columns))}" self.dfs[(sheet.level, sig)] = pd.concat([ pd.DataFrame({ "geo_id": sheet.geo_id_select(df).apply(sheet.geo_id_apply), @@ -190,8 +222,18 @@ def fetch_listing(params): for el in listing if el['filename'].endswith("xlsx") ] - # drop files we already have in the input cache - listing = [el for el in listing if os.path.exists(el['cached_filename'])] + if params['indicator']['reports'] == 'new': + # drop files we already have in the input cache + listing = [el for el in listing if not os.path.exists(el['cached_filename'])] + elif params['indicator']['reports'].find("--") > 0: + # drop files outside the specified publish-date range + start_str, _, end_str = params['indicator']['reports'].partition("--") + start_date = datetime.datetime.strptime(start_str, "%Y-%m-%d").date() + end_date = datetime.datetime.strptime(end_str, "%Y-%m-%d").date() + def keep(attachment): + publish_date = Dataset.parse_publish_date(attachment['filename']) + return start_date <= publish_date <= end_date + listing = [el for el in listing if keep(el)] return listing def download_and_parse(listing, logger): diff --git a/dsew_community_profile/input_cache/.gitignore b/dsew_community_profile/input_cache/.gitignore index e69de29bb..7c1222033 100644 --- a/dsew_community_profile/input_cache/.gitignore +++ b/dsew_community_profile/input_cache/.gitignore @@ -0,0 +1 @@ +*.xlsx diff --git a/dsew_community_profile/params.json.template b/dsew_community_profile/params.json.template index dd61d4a93..5b8d9b1df 100644 --- a/dsew_community_profile/params.json.template +++ b/dsew_community_profile/params.json.template @@ -4,7 +4,8 @@ "log_filename": "dsew_cpr.log" }, "indicator": { - "input_cache": "./input_cache" + "input_cache": "./input_cache", + "reports": "new" }, "validation": { "common": { diff --git a/dsew_community_profile/setup.py b/dsew_community_profile/setup.py index f9ba22053..258126fb1 100644 --- a/dsew_community_profile/setup.py +++ b/dsew_community_profile/setup.py @@ -3,6 +3,7 @@ required = [ "numpy", + "openpyxl", "pandas", "pydocstyle", "pytest", diff --git a/dsew_community_profile/tests/params.json.template b/dsew_community_profile/tests/params.json.template index dd61d4a93..5b8d9b1df 100644 --- a/dsew_community_profile/tests/params.json.template +++ b/dsew_community_profile/tests/params.json.template @@ -4,7 +4,8 @@ "log_filename": "dsew_cpr.log" }, "indicator": { - "input_cache": "./input_cache" + "input_cache": "./input_cache", + "reports": "new" }, "validation": { "common": { diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py index 40812227e..16933ec72 100644 --- a/dsew_community_profile/tests/test_pull.py +++ b/dsew_community_profile/tests/test_pull.py @@ -3,7 +3,7 @@ import pandas as pd import pytest -from delphi_dsew_community_profile.pull import DatasetTimes, as_reference_date +from delphi_dsew_community_profile.pull import DatasetTimes from delphi_dsew_community_profile.pull import Dataset example = namedtuple("example", "given expected") @@ -23,7 +23,7 @@ def test_DatasetTimes(self): with pytest.raises(ValueError): dt["xyzzy"] - def test_as_reference_date(self): + def test_DatasetTimes_from_header(self): examples = [ example("TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)", DatasetTimes("last", date(2021, 10, 30), date(2021, 10, 26))), @@ -31,9 +31,21 @@ def test_as_reference_date(self): DatasetTimes("previous", date(2021, 10, 30), date(2021, 10, 26))), example("TESTING: LAST WEEK (October 24-November 30, Test Volume October 20-26)", DatasetTimes("last", date(2021, 11, 30), date(2021, 10, 26))), + example("VIRAL (RT-PCR) LAB TESTING: LAST WEEK (June 7-13, Test Volume June 3-9 )", + DatasetTimes("last", date(2021, 6, 13), date(2021, 6, 9))), + example("VIRAL (RT-PCR) LAB TESTING: LAST WEEK (March 7-13)", + DatasetTimes("last", date(2021, 3, 13), date(2021, 3, 13))) ] for ex in examples: - assert as_reference_date(ex.given) == ex.expected, ex.given + assert DatasetTimes.from_header(ex.given, date(2021, 12, 31)) == ex.expected, ex.given + + # test year boundary + examples = [ + example("TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)", + DatasetTimes("last", date(2020, 10, 30), date(2020, 10, 26))), + ] + for ex in examples: + assert DatasetTimes.from_header(ex.given, date(2021, 1, 1)) == ex.expected, ex.given def test_Dataset_skip_overheader(self): examples = [ @@ -41,8 +53,14 @@ def test_Dataset_skip_overheader(self): False), example("TESTING: PREVIOUS WEEK (October 17-23, Test Volume October 13-19)", False), + example("VIRAL (RT-PCR) LAB TESTING: LAST WEEK (August 24-30, Test Volume August 20-26)", + False), + example("VIRAL (RT-PCR) LAB TESTING: PREVIOUS WEEK (August 17-23, Test Volume August 13-19)", + False), example("TESTING: % CHANGE FROM PREVIOUS WEEK", True), + example("VIRAL (RT-PCR) LAB TESTING: % CHANGE FROM PREVIOUS WEEK", + True), example("TESTING: DEMOGRAPHIC DATA", True) ] @@ -61,6 +79,12 @@ def test_Dataset_retain_header(self): example("NAAT positivity rate - absolute change (may be an underestimate due to delayed reporting)", False), example("NAAT positivity rate - last 7 days - ages <5", + False), + example("Total RT-PCR diagnostic tests - last 7 days (may be an underestimate due to delayed reporting)", + True), + example("Viral (RT-PCR) lab test positivity rate - last 7 days (may be an underestimate due to delayed reporting)", + True), + example("RT-PCR tests per 100k - last 7 days (may be an underestimate due to delayed reporting)", False) ] for ex in examples: From af692a63f3452df0f914bfcef9bd21eab8027964 Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Fri, 12 Nov 2021 17:10:52 -0500 Subject: [PATCH 03/32] Documentation --- dsew_community_profile/DETAILS.md | 133 ++++++++++++++++++++++++++++++ dsew_community_profile/README.md | 22 ++++- 2 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 dsew_community_profile/DETAILS.md diff --git a/dsew_community_profile/DETAILS.md b/dsew_community_profile/DETAILS.md new file mode 100644 index 000000000..56816ee06 --- /dev/null +++ b/dsew_community_profile/DETAILS.md @@ -0,0 +1,133 @@ +# Dataset layout + +The Data Strategy and Execution Workgroup (DSEW) publishes a Community Profile +Report each weekday, comprising a pair of files: an Excel workbook (.xlsx) and a +PDF which shows select metrics from the workbook as time series charts and +choropleth maps. These files are listed as attachments on the healthdata.gov +site: + +https://healthdata.gov/Health/COVID-19-Community-Profile-Report/gqxm-d9w9 + +Each Excel file attachment has a filename. The filename contains a date, +presumably the publish date. The attachment also has an alphanumeric +assetId. Both the filename and the assetId are required for downloading the +file. Whether this means that updated versions of a particular file may be +uploaded by DSEW at later times is not known. The attachment does not explicitly +list an upload timestamp. To be safe, we cache our downloads using both the +assetId and the filename. + +# Workbook layout + +Each Excel file is a workbook with multiple sheets. The exemplar file used in +writing this indicator is "Community Profile Report 20211102.xlsx". The sheets +include: + +- User Notes: Instructions for using the workbook +- Overview: US National figures for the last 5 weeks, plus monthly peaks back to + April 2020 +- Regions*: Figures for FEMA regions (double-checked: they match HHS regions + except that FEMA 2 does not include Palau while HHS 2 does) +- States*: Figures for US states and territories +- CBSAs*: Figures for US Census Block Statistical Areas +- Counties*: Figures for US counties +- Weekly Transmission Categories: Lists of high, substantial, and moderate + transmission states and territories +- National Peaks: Monthly national peaks back to April 2020 +- National Historic: Daily national figures back to January 22 2020 +- Data Notes: Source and methods information for all metrics +- Color Thresholds: Color-coding is used extensively in all sheets; these are + the keys + +The starred sheets above have nearly-identical column layouts, and together +cover the county, MSA, state, and HHS geographical levels used in +covidcast. Rather than aggregate them ourselves and risk a mismatch, this +indicator lifts these geographical aggregations directly from the corresponding +sheets of the workbook. + +GeoMapper _is_ used to generate national figures from +state, due to architectural differences between the starred sheets and the +Overview sheet. If we discover that our nation-level figures differ too much +from those listed in the Overview sheet, we can add dedicated parsing for the +Overview sheet and remove GeoMapper from this indicator altogether. + +# Sheet layout + +## Headers + +Each starred sheet has two rows of headers. The first row uses merged cells to +group several columns together under a single "overheader". This overheader +often includes the reference period for that group of columns, such as: + +- CASES/DEATHS: LAST WEEK (October 26-November 1) +- TESTING: LAST WEEK (October 24-30, Test Volume October 20-26) +- TESTING: PREVIOUS WEEK (October 17-23, Test Volume October 13-19) + +Overheaders have changed periodically since the first report. For example, the +"TESTING: LAST WEEK" overheader above has also appeared as "VIRAL (RT-PCR) LAB +TESTING: LAST WEEK", with and without a separate reference date for Test +Volume. All known overheader forms are checked in test_pull.py. + +The second row contains a header for each column. The headers uniquely identify +each column included in the sheet. Column headers include spaces, and typically +specify both the metric and the reference period over which it was calculated, +such as: + +- Total NAATs - last 7 days (may be an underestimate due to delayed reporting) +- NAAT positivity rate - previous 7 days (may be an underestimate due to delayed + reporting) + +Columns headers have also changed periodically since the first report. For +example, the "Total NAATs - last 7 days" header above has also appeared as +"Total RT-PCR diagnostic tests - last 7 days". + +## Contents + +Each starred sheet contains test positivity and total test volume figures for +two reference periods, "last [week]" and "previous [week]". In some reports, the +reference periods for test positivity and total test volume are the same; in +others, they are different, such that the report contains figures for four +distinct reference periods, two for each metric we extract. + +# Time series conversions and parsing notes + +## Reference date + +The reference period in the overheader never includes the year. We guess the +reference year by picking the same year as the publish date (i.e., the date +extracted from the filename), and if the reference month is greater than the +publish month, subtract 1 from the reference year. This adequately covers the +December-January boundary. + +We select as reference date the end date of the reference period for each +metric. Reference periods are always 7 days, so this indicator produces +seven-day averages. We divide the total testing volume by seven and leave the +test positivity alone. + +## Geo ID + +The Counties sheet lists FIPS codes numerically, such that FIPS with a leading +zero only have four digits. We fix this by zero-filling to five characters. + +MSAs are a subset of CBSAs. We fix this by selecting only CBSAs with type +"Metropolitan". + +Most of the starred sheets have the geo id as the first non-index column. The +Region sheet has no such column. We fix this by generating the HHS ids from the +index column instead. + +## Combining multiple reports + +Each report file generates two reference dates for each metric, up to four +reference dates total. Since it's not clear whether new versions of past files +are ever made available, the default mode (params.indicator.reports="new") +fetches any files that are not already in the input cache, then combines the +results into a single data frame before exporting. This will generate correct +behavior should (for instance) a previously-downloaded file get a new assetId. + +For the initial run on an empty input cache, and for runs configured to process +a range of reports (using params.indicator.reports=YYYY-mm-dd--YYYY-mm-dd), this +indicator makes no distinction between figures that came from different +reports. That may not be what you want. If the covidcast issue date needs to +match the date on the report filename, then the indicator must instead be run +repeatedly, with equal start and end dates, keeping the output of each run +separate. diff --git a/dsew_community_profile/README.md b/dsew_community_profile/README.md index e4f3e64e4..38abd8ecf 100644 --- a/dsew_community_profile/README.md +++ b/dsew_community_profile/README.md @@ -1,6 +1,26 @@ # COVID-19 Community Profile Report - +The Data Strategy and Execution Workgroup (DSEW) publishes a Community Profile +Report each weekday at this location: + +https://healthdata.gov/Health/COVID-19-Community-Profile-Report/gqxm-d9w9 + +This indicator extracts COVID-19 test figures from these reports. + +Indicator-specific parameters: + +* input_cache: a directory where Excel (.xlsx) files downloaded from + healthdata.gov will be stored for posterity. Each file is 3.3 MB in size, so + we expect this directory to require ~1GB of disk space for each year of + operation. +* reports: {new | all | YYYY-mm-dd--YYYY-mm-dd} a string indicating which + reports to export. The default, "new", downloads and exports only reports not + already found in the input cache. The "all" setting exports data for all + available reports, downloading them to the input cache if necessary. The date + range setting refers to the date listed in the filename for the report, + presumably the publish date. Only reports named with a date within the + specified range (inclusive) will be downloaded to the input cache if necessary + and exported. ## Running the Indicator From 07559f3df1596312a24faf82b1d5c5255b9fc6be Mon Sep 17 00:00:00 2001 From: Katie Mazaitis Date: Fri, 19 Nov 2021 13:04:15 -0500 Subject: [PATCH 04/32] [dsew] Add suggested dynamic validator params Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- dsew_community_profile/params.json.template | 8 +++++++- dsew_community_profile/tests/params.json.template | 8 +++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/dsew_community_profile/params.json.template b/dsew_community_profile/params.json.template index 5b8d9b1df..89cee4bf0 100644 --- a/dsew_community_profile/params.json.template +++ b/dsew_community_profile/params.json.template @@ -21,6 +21,12 @@ "missing_se_allowed": true, "missing_sample_size_allowed": true }, - "dynamic": {} + "dynamic": { + "ref_window_size": 7, + "smoothed_signals": [ + "naats_total_7dav", + "naats_positivity_7dav" + ] + } } } diff --git a/dsew_community_profile/tests/params.json.template b/dsew_community_profile/tests/params.json.template index 5b8d9b1df..89cee4bf0 100644 --- a/dsew_community_profile/tests/params.json.template +++ b/dsew_community_profile/tests/params.json.template @@ -21,6 +21,12 @@ "missing_se_allowed": true, "missing_sample_size_allowed": true }, - "dynamic": {} + "dynamic": { + "ref_window_size": 7, + "smoothed_signals": [ + "naats_total_7dav", + "naats_positivity_7dav" + ] + } } } From ac57ecc61e9bb5e9af520652faa958f4760152b9 Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Mon, 29 Nov 2021 12:43:11 -0500 Subject: [PATCH 05/32] Fixes from code review * Use export_start_date / export_end_date * Correct nation aggregation for positivity as rate --- dsew_community_profile/README.md | 6 ++- .../delphi_dsew_community_profile/pull.py | 22 +++++++-- .../delphi_dsew_community_profile/run.py | 13 ++++- dsew_community_profile/tests/test_pull.py | 49 +++++++++++++++++++ 4 files changed, 82 insertions(+), 8 deletions(-) diff --git a/dsew_community_profile/README.md b/dsew_community_profile/README.md index 38abd8ecf..b070217a1 100644 --- a/dsew_community_profile/README.md +++ b/dsew_community_profile/README.md @@ -9,11 +9,11 @@ This indicator extracts COVID-19 test figures from these reports. Indicator-specific parameters: -* input_cache: a directory where Excel (.xlsx) files downloaded from +* `input_cache`: a directory where Excel (.xlsx) files downloaded from healthdata.gov will be stored for posterity. Each file is 3.3 MB in size, so we expect this directory to require ~1GB of disk space for each year of operation. -* reports: {new | all | YYYY-mm-dd--YYYY-mm-dd} a string indicating which +* `reports`: {new | all | YYYY-mm-dd--YYYY-mm-dd} a string indicating which reports to export. The default, "new", downloads and exports only reports not already found in the input cache. The "all" setting exports data for all available reports, downloading them to the input cache if necessary. The date @@ -21,6 +21,8 @@ Indicator-specific parameters: presumably the publish date. Only reports named with a date within the specified range (inclusive) will be downloaded to the input cache if necessary and exported. +* `export_start_date`: a YYYY-mm-dd string indicating the first date to export. +* `export_end_date`: a YYYY-mm-dd string indicating the final date to export. ## Running the Indicator diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 2c1ba5b3d..669936b2d 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -218,7 +218,11 @@ def fetch_listing(params): # drop the pdf files listing = [ - dict(el, cached_filename=as_cached_filename(params, el)) + dict( + el, + cached_filename=as_cached_filename(params, el), + publish_date=Dataset.parse_publish_date(el['filename']) + ) for el in listing if el['filename'].endswith("xlsx") ] @@ -230,10 +234,18 @@ def fetch_listing(params): start_str, _, end_str = params['indicator']['reports'].partition("--") start_date = datetime.datetime.strptime(start_str, "%Y-%m-%d").date() end_date = datetime.datetime.strptime(end_str, "%Y-%m-%d").date() - def keep(attachment): - publish_date = Dataset.parse_publish_date(attachment['filename']) - return start_date <= publish_date <= end_date - listing = [el for el in listing if keep(el)] + listing = [ + el for el in listing + if start_date <= el['publish_date'] <= end_date + ] + # reference date is guaranteed to be before publish date, so we can trim + # reports that are too early + if 'export_start_date' in params['indicator']: + listing = [ + el for el in listing + if params['indicator']['export_start_date'] < el['publish_date'] + ] + # can't do the same for export_end_date return listing def download_and_parse(listing, logger): diff --git a/dsew_community_profile/delphi_dsew_community_profile/run.py b/dsew_community_profile/delphi_dsew_community_profile/run.py index 1856da215..0276c61de 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/run.py +++ b/dsew_community_profile/delphi_dsew_community_profile/run.py @@ -36,6 +36,16 @@ def run_module(params): logger = get_structured_logger( __name__, filename=params["common"].get("log_filename"), log_exceptions=params["common"].get("log_exceptions", True)) + def replace_date_param(p): + if p in params["indicator"]: + date_param = datetime.strptime(params["indicator"][p], "%Y-%m-%d").date() + params["indicator"][p] = date_param + replace_date_param("export_start_date") + replace_date_param("export_end_date") + export_params = { + 'start_date': params["indicator"].get("export_start_date", None), + 'end_date': params["indicator"].get("export_end_date", None) + } run_stats = [] dfs = fetch_new_reports(params, logger) @@ -45,7 +55,8 @@ def run_module(params): df, params['common']['export_dir'], geo, - make_signal_name(sig) + make_signal_name(sig), + **export_params ) if len(dates)>0: run_stats.append((max(dates), len(dates))) diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py index 16933ec72..15ce1829b 100644 --- a/dsew_community_profile/tests/test_pull.py +++ b/dsew_community_profile/tests/test_pull.py @@ -1,10 +1,13 @@ from collections import namedtuple from datetime import date +from itertools import chain import pandas as pd import pytest +from unittest.mock import patch, Mock from delphi_dsew_community_profile.pull import DatasetTimes from delphi_dsew_community_profile.pull import Dataset +from delphi_dsew_community_profile.pull import fetch_listing example = namedtuple("example", "given expected") @@ -93,3 +96,49 @@ def test_Dataset_retain_header(self): def test_Dataset_parse_sheet(self): # TODO pass + @patch('requests.get') + @patch('os.path.exists') + def test_fetch_listing(self, mock_listing, mock_exists): + inst = namedtuple("attachment", "assetId filename publish cache") + instances = list(chain(*[ + [ + inst(f"{i}", f"2021010{i}.xlsx", date(2021, 1, i), f"{i}---2021010{i}.xlsx"), + inst(f"p{i}", f"2021010{i}.pdf", date(2021, 1, i), f"p{i}---2021010{i}.pdf"), + ] + for i in [1, 2, 3, 4, 5] + ])) + + mock_listing.return_value = Mock() + mock_listing.return_value.json = Mock( + return_value = { + 'metadata': { + 'attachments': [ + {"assetId": i.assetId, "filename": i.filename} + for i in instances + ] + } + } + ) + + mock_exists.reset_mock(return_value=False) + + def as_listing(instance): + return { + "assetId": instance.assetId, + "filename": instance.filename, + "cached_filename": instance.cache, + "publish_date": instance.publish + } + ex = example( + {'indicator':{'reports':'new'}}, + [ + as_listing(instance) + for i, instance in filter(lambda x: x[0]%2 == 0, enumerate(instances)) + ] + ) + + + for actual, expected in zip(fetch_listing(ex.given), ex.expected): + assert actual == expected + + From e363c065932babbf304d09e49032603c27b2a4b8 Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Mon, 10 Jan 2022 14:39:51 -0500 Subject: [PATCH 06/32] Actually fix rate aggregation this time --- .../constants.py | 9 ++-- .../delphi_dsew_community_profile/pull.py | 23 +++++--- dsew_community_profile/tests/test_pull.py | 53 ++++++++++++++++++- 3 files changed, 73 insertions(+), 12 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/constants.py b/dsew_community_profile/delphi_dsew_community_profile/constants.py index 07ceb7bc6..fa648b004 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/constants.py +++ b/dsew_community_profile/delphi_dsew_community_profile/constants.py @@ -46,10 +46,11 @@ class Transform: ) ]} -SIGNALS = [ - "total", - "positivity" -] +# signal id : is_rate +SIGNALS = { + "total": False, + "positivity": True +} def make_signal_name(key): """Convert a signal key to the corresponding signal name for the API.""" diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 669936b2d..dbceefe0e 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -259,6 +259,19 @@ def download_and_parse(listing, logger): datasets[sig].append(df) return datasets +def nation_from_state(df, sig, geomapper): + """Compute nation level from state df.""" + if SIGNALS[sig]: # true if sig is a rate + df = geomapper.add_population_column(df, "state_code") \ + .rename(columns={"population":"weight"}) + df.weight = df.weight / df.weight.sum() + return geomapper.replace_geocode( + df, + 'state_code', + 'nation', + new_col="geo_id" + ) + def fetch_new_reports(params, logger=None): """Retrieve, compute, and collate all data we haven't seen yet.""" listing = fetch_listing(params) @@ -274,12 +287,10 @@ def fetch_new_reports(params, logger=None): # add nation from state geomapper = GeoMapper() for sig in SIGNALS: - df = geomapper.replace_geocode( - ret[("state", sig)].rename(columns={"geo_id":"state_code"}), - 'state_code', - 'nation', - new_col="geo_id" + ret[("nation", sig)] = nation_from_state( + ret[("state", sig)].rename(columns={"geo_id": "state_code"}), + sig, + geomapper ) - ret[("nation", sig)] = df return ret diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py index 15ce1829b..067a45742 100644 --- a/dsew_community_profile/tests/test_pull.py +++ b/dsew_community_profile/tests/test_pull.py @@ -1,13 +1,15 @@ from collections import namedtuple -from datetime import date +from datetime import date, datetime from itertools import chain import pandas as pd import pytest from unittest.mock import patch, Mock +from delphi_utils.geomap import GeoMapper + from delphi_dsew_community_profile.pull import DatasetTimes from delphi_dsew_community_profile.pull import Dataset -from delphi_dsew_community_profile.pull import fetch_listing +from delphi_dsew_community_profile.pull import fetch_listing, nation_from_state example = namedtuple("example", "given expected") @@ -141,4 +143,51 @@ def as_listing(instance): for actual, expected in zip(fetch_listing(ex.given), ex.expected): assert actual == expected + def test_nation_from_state(self): + geomapper = GeoMapper() + state_pop = geomapper.get_crosswalk("state_id", "pop") + + test_df = geomapper.replace_geocode( + pd.DataFrame({ + 'geo_id': ['pa', 'wv'], + 'timestamp': [datetime(year=2020, month=1, day=1)]*2, + 'val': [15., 150.],}), + "state_id", + "state_code", + "geo_id" + ) + pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"]) + wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"]) + tot_pop = pa_pop + wv_pop + + assert True, nation_from_state( + test_df.copy(), + "total", + geomapper + ) + pd.testing.assert_frame_equal( + nation_from_state( + test_df.copy(), + "total", + geomapper + ), + pd.DataFrame({ + 'geo_id': ['us'], + 'timestamp': [datetime(year=2020, month=1, day=1)], + 'val': [15. + 150.],}), + check_like=True + ) + + pd.testing.assert_frame_equal( + nation_from_state( + test_df.copy(), + "positivity", + geomapper + ), + pd.DataFrame({ + 'geo_id': ['us'], + 'timestamp': [datetime(year=2020, month=1, day=1)], + 'val': [15*pa_pop/tot_pop + 150*wv_pop/tot_pop],}), + check_like=True + ) From 691e3e3d46611502c5925fb20590386d96f68a97 Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Mon, 10 Jan 2022 14:43:46 -0500 Subject: [PATCH 07/32] remove whitespace --- dsew_community_profile/tests/test_pull.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py index 067a45742..cdbf5eaf8 100644 --- a/dsew_community_profile/tests/test_pull.py +++ b/dsew_community_profile/tests/test_pull.py @@ -98,6 +98,7 @@ def test_Dataset_retain_header(self): def test_Dataset_parse_sheet(self): # TODO pass + @patch('requests.get') @patch('os.path.exists') def test_fetch_listing(self, mock_listing, mock_exists): @@ -109,7 +110,7 @@ def test_fetch_listing(self, mock_listing, mock_exists): ] for i in [1, 2, 3, 4, 5] ])) - + mock_listing.return_value = Mock() mock_listing.return_value.json = Mock( return_value = { @@ -138,8 +139,7 @@ def as_listing(instance): for i, instance in filter(lambda x: x[0]%2 == 0, enumerate(instances)) ] ) - - + for actual, expected in zip(fetch_listing(ex.given), ex.expected): assert actual == expected @@ -156,7 +156,7 @@ def test_nation_from_state(self): "state_code", "geo_id" ) - + pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"]) wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"]) tot_pop = pa_pop + wv_pop From ecd4e4cba7ac7d7b8ef0df369b8d62db3bae26f7 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 11 Jan 2022 16:25:03 -0500 Subject: [PATCH 08/32] initial add hospital admissions harder changes spot fixes almost working --- .../constants.py | 21 ++++- .../delphi_dsew_community_profile/pull.py | 76 ++++++++++++++----- 2 files changed, 75 insertions(+), 22 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/constants.py b/dsew_community_profile/delphi_dsew_community_profile/constants.py index fa648b004..6b2446176 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/constants.py +++ b/dsew_community_profile/delphi_dsew_community_profile/constants.py @@ -46,14 +46,27 @@ class Transform: ) ]} -# signal id : is_rate +# signal id : is_rate, name to report in API SIGNALS = { - "total": False, - "positivity": True + "total": { + "is_rate" : False, + "api_name": "naats_total_7dav" + }, + "positivity": { + "is_rate" : True, + "api_name": "naats_positivity_7dav" + }, + "confirmed covid-19 admissions": { + "is_rate" : False, + "api_name": "confirmed_admissions_covid_1d_7dav", + "date_key": "hosp" + } } +TOTAL_7D_SIGNALS = ("total", "confirmed covid-19 admissions") + def make_signal_name(key): """Convert a signal key to the corresponding signal name for the API.""" - return f"naats_{key}_7dav" + return SIGNALS[key]["api_name"] NEWLINE="\n" diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index dbceefe0e..f7e2fa565 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -11,7 +11,7 @@ from delphi_utils.geomap import GeoMapper -from .constants import TRANSFORMS, SIGNALS, NEWLINE +from .constants import TRANSFORMS, SIGNALS, TOTAL_7D_SIGNALS, NEWLINE from .constants import DOWNLOAD_ATTACHMENT, DOWNLOAD_LISTING # YYYYMMDD @@ -22,10 +22,15 @@ # example: "TESTING: PREVIOUS WEEK (October 17-23, Test Volume October 13-19)" DATE_EXP = r'(?:([A-Za-z]*) )?([0-9]{1,2})' DATE_RANGE_EXP = f"{DATE_EXP}-{DATE_EXP}" -RE_DATE_FROM_HEADER = re.compile( +RE_DATE_FROM_TEST_HEADER = re.compile( rf'.*TESTING: (.*) WEEK \({DATE_RANGE_EXP}(?:, Test Volume ({DATE_RANGE_EXP}))? *\)' ) +# example: "HOSPITAL UTILIZATION: LAST WEEK (January 2-8)" +RE_DATE_FROM_HOSP_HEADER = re.compile( + rf'HOSPITAL UTILIZATION: (.*) WEEK \({DATE_RANGE_EXP}\)' +) + # example: "NAAT positivity rate - last 7 days (may be an underestimate due to delayed reporting)" # example: "Total NAATs - last 7 days (may be an underestimate due to delayed reporting)" RE_COLUMN_FROM_HEADER = re.compile('- (.*) 7 days') @@ -37,13 +42,11 @@ class DatasetTimes: column: str positivity_reference_date: datetime.date total_reference_date: datetime.date + hosp_reference_date: datetime.date @staticmethod def from_header(header, publish_date): """Convert reference dates in overheader to DatasetTimes.""" - assert RE_DATE_FROM_HEADER.match(header), \ - f"Couldn't find reference date in header '{header}'" - findall_result = RE_DATE_FROM_HEADER.findall(header)[0] def as_date(sub_result): month = sub_result[2] if sub_result[2] else sub_result[0] assert month, f"Bad month in header: {header}\nsub_result: {sub_result}" @@ -55,23 +58,39 @@ def as_date(sub_result): year -= 1 return datetime.datetime.strptime(f"{year}-{month}-{day}", "%Y-%B-%d").date() - column = findall_result[0].lower() - positivity_reference_date = as_date(findall_result[1:5]) - if findall_result[6]: - # Reports published starting 2021-03-17 specify different reference - # dates for positivity and total test volume - total_reference_date = as_date(findall_result[6:10]) + if RE_DATE_FROM_TEST_HEADER.match(header): + findall_result = RE_DATE_FROM_TEST_HEADER.findall(header)[0] + column = findall_result[0].lower() + positivity_reference_date = as_date(findall_result[1:5]) + if findall_result[6]: + # Reports published starting 2021-03-17 specify different reference + # dates for positivity and total test volume + total_reference_date = as_date(findall_result[6:10]) + else: + total_reference_date = positivity_reference_date + + hosp_reference_date = None + elif RE_DATE_FROM_HOSP_HEADER.match(header): + findall_result = RE_DATE_FROM_HOSP_HEADER.findall(header)[0] + column = SIGNALS["confirmed covid-19 admissions"]["date_key"] + hosp_reference_date = as_date(findall_result[1:5]) + + total_reference_date = None + positivity_reference_date = None else: - total_reference_date = positivity_reference_date - return DatasetTimes(column, positivity_reference_date, total_reference_date) + raise ValueError(f"Couldn't find reference date in header '{header}'") + + return DatasetTimes(column, positivity_reference_date, total_reference_date, hosp_reference_date) def __getitem__(self, key): """Use DatasetTimes like a dictionary.""" if key.lower()=="positivity": return self.positivity_reference_date if key.lower()=="total": return self.total_reference_date + if key.lower()=="confirmed covid-19 admissions": + return self.hosp_reference_date raise ValueError( - f"Bad reference date type request '{key}'; need 'total' or 'positivity'" + f"Bad reference date type request '{key}'; need 'total', 'positivity', or 'confirmed covid-19 admissions'" ) def __eq__(self, other): """Check equality by value.""" @@ -127,11 +146,15 @@ def skip_overheader(header): """Ignore irrelevant overheaders.""" # include "TESTING: [LAST|PREVIOUS] WEEK (October 24-30, Test Volume October 20-26)" # include "VIRAL (RT-PCR) LAB TESTING: [LAST|PREVIOUS] WEEK (August 24-30, ..." + # include "HOSPITAL UTILIZATION: LAST WEEK (January 2-8)" return not (isinstance(header, str) and \ (header.startswith("TESTING:") or \ - header.startswith("VIRAL (RT-PCR) LAB TESTING:")) and \ + header.startswith("VIRAL (RT-PCR) LAB TESTING:") or \ + header.startswith("HOSPITAL UTILIZATION:")) and \ # exclude "TESTING: % CHANGE FROM PREVIOUS WEEK" \ # exclude "TESTING: DEMOGRAPHIC DATA" \ + # exclude "HOSPITAL UTILIZATION: CHANGE FROM PREVIOUS WEEK" \ + # exclude "HOSPITAL UTILIZATION: DEMOGRAPHIC DATA" \ header.find("WEEK (") > 0) def _parse_times_for_sheet(self, sheet): """Record reference dates for this sheet.""" @@ -152,7 +175,7 @@ def _parse_times_for_sheet(self, sheet): f"vs previous {self.times[dt.column]}" else: self.times[dt.column] = dt - assert len(self.times) == 2, \ + assert len(self.times) == 3, \ f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" @staticmethod @@ -171,6 +194,16 @@ def retain_header(header): header.find("7 days") > 0, # exclude "NAAT positivity rate - last 7 days - ages <5" header.find(" ages") < 0, + ]) or all([ + # include "Confirmed COVID-19 admissions - last 7 days" + header.startswith("Confirmed COVID-19 admissions"), + # exclude "Confirmed COVID-19 admissions - percent change" + header.find("7 days") > 0, + # exclude "Confirmed COVID-19 admissions - last 7 days - ages <18" + # exclude "Confirmed COVID-19 admissions - last 7 days - age unknown" + header.find(" age") < 0, + # exclude "Confirmed COVID-19 admissions per 100 inpatient beds - last 7 days" + header.find(" beds") < 0, ]) def _parse_sheet(self, sheet): """Extract data frame for this sheet.""" @@ -192,17 +225,24 @@ def _parse_sheet(self, sheet): sig_select = [s for s in select if s[-1].find(sig) >= 0] assert len(sig_select) > 0, \ f"No {sig} in any of {select}\n\nAll headers:\n{NEWLINE.join(list(df.columns))}" + + date_keys = { + si:( SIGNALS[sig]["date_key"] if "date_key" in SIGNALS[sig] else si[0] ) + for si in sig_select + } self.dfs[(sheet.level, sig)] = pd.concat([ pd.DataFrame({ "geo_id": sheet.geo_id_select(df).apply(sheet.geo_id_apply), - "timestamp": pd.to_datetime(self.times[si[0]][sig]), + "timestamp": pd.to_datetime(self.times[date_keys[si]][sig]), "val": df[si[-2]], "se": None, "sample_size": None }) for si in sig_select ]) - self.dfs[(sheet.level, "total")]["val"] /= 7 # 7-day total -> 7-day average + + for sig in TOTAL_7D_SIGNALS: + self.dfs[(sheet.level, sig)]["val"] /= 7 # 7-day total -> 7-day average def as_cached_filename(params, config): From f93c2edcede4ccb6b68e08de8b5768c1dc3ffdaa Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 13 Jan 2022 16:06:35 -0500 Subject: [PATCH 09/32] simplify date keys logic --- .../constants.py | 5 +-- .../delphi_dsew_community_profile/pull.py | 39 +++++++++++++------ 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/constants.py b/dsew_community_profile/delphi_dsew_community_profile/constants.py index 6b2446176..51c62b5ea 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/constants.py +++ b/dsew_community_profile/delphi_dsew_community_profile/constants.py @@ -58,12 +58,11 @@ class Transform: }, "confirmed covid-19 admissions": { "is_rate" : False, - "api_name": "confirmed_admissions_covid_1d_7dav", - "date_key": "hosp" + "api_name": "confirmed_admissions_covid_1d_7dav" } } -TOTAL_7D_SIGNALS = ("total", "confirmed covid-19 admissions") +COUNTS_7D_SIGNALS = {key for key, value in SIGNALS.items() if not value["is_rate"]} def make_signal_name(key): """Convert a signal key to the corresponding signal name for the API.""" diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index f7e2fa565..80fd16b93 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -11,7 +11,7 @@ from delphi_utils.geomap import GeoMapper -from .constants import TRANSFORMS, SIGNALS, TOTAL_7D_SIGNALS, NEWLINE +from .constants import TRANSFORMS, SIGNALS, COUNTS_7D_SIGNALS, NEWLINE from .constants import DOWNLOAD_ATTACHMENT, DOWNLOAD_LISTING # YYYYMMDD @@ -72,7 +72,7 @@ def as_date(sub_result): hosp_reference_date = None elif RE_DATE_FROM_HOSP_HEADER.match(header): findall_result = RE_DATE_FROM_HOSP_HEADER.findall(header)[0] - column = SIGNALS["confirmed covid-19 admissions"]["date_key"] + column = findall_result[0].lower() hosp_reference_date = as_date(findall_result[1:5]) total_reference_date = None @@ -92,6 +92,18 @@ def __getitem__(self, key): raise ValueError( f"Bad reference date type request '{key}'; need 'total', 'positivity', or 'confirmed covid-19 admissions'" ) + def __setitem__(self, key, newvalue): + """Use DatasetTimes like a dictionary.""" + if key.lower()=="positivity": + self.positivity_reference_date = newvalue + if key.lower()=="total": + self.total_reference_date = newvalue + if key.lower()=="confirmed covid-19 admissions": + self.hosp_reference_date = newvalue + else: + raise ValueError( + f"Bad reference date type request '{key}'; need 'total', 'positivity', or 'confirmed covid-19 admissions'" + ) def __eq__(self, other): """Check equality by value.""" return isinstance(other, DatasetTimes) and \ @@ -170,12 +182,19 @@ def _parse_times_for_sheet(self, sheet): dt = DatasetTimes.from_header(h, self.publish_date) if dt.column in self.times: - assert self.times[dt.column] == dt, \ - f"Conflicting reference date from {sheet.name} {dt}" + \ - f"vs previous {self.times[dt.column]}" + # Items that are not None should be the same between sheets. + # Fill None items with the newly calculated version of the + # field from dt. + for sig in SIGNALS: + if self.times[dt.column][sig] is not None and dt[sig] is not None: + assert self.times[dt.column][sig] == dt[sig], \ + f"Conflicting reference date from {sheet.name} {dt[sig]}" + \ + f"vs previous {self.times[dt.column][sig]}" + elif self.times[dt.column][sig] is None: + self.times[dt.column][sig] = dt[sig] else: self.times[dt.column] = dt - assert len(self.times) == 3, \ + assert len(self.times) == 2, \ f"No times extracted from overheaders:\n{NEWLINE.join(str(s) for s in overheaders)}" @staticmethod @@ -226,14 +245,10 @@ def _parse_sheet(self, sheet): assert len(sig_select) > 0, \ f"No {sig} in any of {select}\n\nAll headers:\n{NEWLINE.join(list(df.columns))}" - date_keys = { - si:( SIGNALS[sig]["date_key"] if "date_key" in SIGNALS[sig] else si[0] ) - for si in sig_select - } self.dfs[(sheet.level, sig)] = pd.concat([ pd.DataFrame({ "geo_id": sheet.geo_id_select(df).apply(sheet.geo_id_apply), - "timestamp": pd.to_datetime(self.times[date_keys[si]][sig]), + "timestamp": pd.to_datetime(self.times[si[0]][sig]), "val": df[si[-2]], "se": None, "sample_size": None @@ -241,7 +256,7 @@ def _parse_sheet(self, sheet): for si in sig_select ]) - for sig in TOTAL_7D_SIGNALS: + for sig in COUNTS_7D_SIGNALS: self.dfs[(sheet.level, sig)]["val"] /= 7 # 7-day total -> 7-day average From 2ad52eb1abd4a2f054c0f0a3214c286fb863e532 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 13 Jan 2022 16:29:19 -0500 Subject: [PATCH 10/32] fix state to nation calc --- .../delphi_dsew_community_profile/pull.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 80fd16b93..e2a91a3b8 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -316,13 +316,13 @@ def download_and_parse(listing, logger): def nation_from_state(df, sig, geomapper): """Compute nation level from state df.""" - if SIGNALS[sig]: # true if sig is a rate - df = geomapper.add_population_column(df, "state_code") \ + if SIGNALS[sig]["is_rate"]: # true if sig is a rate + df = geomapper.add_population_column(df, "state_id") \ .rename(columns={"population":"weight"}) df.weight = df.weight / df.weight.sum() return geomapper.replace_geocode( df, - 'state_code', + 'state_id', 'nation', new_col="geo_id" ) @@ -343,7 +343,7 @@ def fetch_new_reports(params, logger=None): geomapper = GeoMapper() for sig in SIGNALS: ret[("nation", sig)] = nation_from_state( - ret[("state", sig)].rename(columns={"geo_id": "state_code"}), + ret[("state", sig)].rename(columns={"geo_id": "state_id"}), sig, geomapper ) From cfbb8a960feaae96000eb3328c7f29d1fd201750 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 13 Jan 2022 17:15:19 -0500 Subject: [PATCH 11/32] linting and tests --- .../delphi_dsew_community_profile/pull.py | 9 ++-- dsew_community_profile/tests/test_pull.py | 50 ++++++++++++------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index e2a91a3b8..a47309a94 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -80,7 +80,8 @@ def as_date(sub_result): else: raise ValueError(f"Couldn't find reference date in header '{header}'") - return DatasetTimes(column, positivity_reference_date, total_reference_date, hosp_reference_date) + return DatasetTimes(column, positivity_reference_date, + total_reference_date, hosp_reference_date) def __getitem__(self, key): """Use DatasetTimes like a dictionary.""" if key.lower()=="positivity": @@ -90,7 +91,8 @@ def __getitem__(self, key): if key.lower()=="confirmed covid-19 admissions": return self.hosp_reference_date raise ValueError( - f"Bad reference date type request '{key}'; need 'total', 'positivity', or 'confirmed covid-19 admissions'" + f"Bad reference date type request '{key}'; " + \ + "need 'total', 'positivity', or 'confirmed covid-19 admissions'" ) def __setitem__(self, key, newvalue): """Use DatasetTimes like a dictionary.""" @@ -102,7 +104,8 @@ def __setitem__(self, key, newvalue): self.hosp_reference_date = newvalue else: raise ValueError( - f"Bad reference date type request '{key}'; need 'total', 'positivity', or 'confirmed covid-19 admissions'" + f"Bad reference date type request '{key}'; " + \ + "need 'total', 'positivity', or 'confirmed covid-19 admissions'" ) def __eq__(self, other): """Check equality by value.""" diff --git a/dsew_community_profile/tests/test_pull.py b/dsew_community_profile/tests/test_pull.py index cdbf5eaf8..e472bf3d6 100644 --- a/dsew_community_profile/tests/test_pull.py +++ b/dsew_community_profile/tests/test_pull.py @@ -16,30 +16,35 @@ class TestPull: def test_DatasetTimes(self): examples = [ - example(DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20)), - DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20))), + example(DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20), date(2021, 10, 22)), + DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20), date(2021, 10, 22))), ] for ex in examples: assert ex.given == ex.expected, "Equality" - dt = DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20)) + dt = DatasetTimes("xyzzy", date(2021, 10, 30), date(2021, 10, 20), date(2021, 10, 22)) assert dt["positivity"] == date(2021, 10, 30), "positivity" assert dt["total"] == date(2021, 10, 20), "total" + assert dt["confirmed covid-19 admissions"] == date(2021, 10, 22), "confirmed covid-19 admissions" with pytest.raises(ValueError): dt["xyzzy"] def test_DatasetTimes_from_header(self): examples = [ example("TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)", - DatasetTimes("last", date(2021, 10, 30), date(2021, 10, 26))), + DatasetTimes("last", date(2021, 10, 30), date(2021, 10, 26), None)), example("TESTING: PREVIOUS WEEK (October 24-30, Test Volume October 20-26)", - DatasetTimes("previous", date(2021, 10, 30), date(2021, 10, 26))), + DatasetTimes("previous", date(2021, 10, 30), date(2021, 10, 26), None)), example("TESTING: LAST WEEK (October 24-November 30, Test Volume October 20-26)", - DatasetTimes("last", date(2021, 11, 30), date(2021, 10, 26))), + DatasetTimes("last", date(2021, 11, 30), date(2021, 10, 26), None)), example("VIRAL (RT-PCR) LAB TESTING: LAST WEEK (June 7-13, Test Volume June 3-9 )", - DatasetTimes("last", date(2021, 6, 13), date(2021, 6, 9))), + DatasetTimes("last", date(2021, 6, 13), date(2021, 6, 9), None)), example("VIRAL (RT-PCR) LAB TESTING: LAST WEEK (March 7-13)", - DatasetTimes("last", date(2021, 3, 13), date(2021, 3, 13))) + DatasetTimes("last", date(2021, 3, 13), date(2021, 3, 13), None)), + example("HOSPITAL UTILIZATION: LAST WEEK (June 2-8)", + DatasetTimes("last", None, None, date(2021, 6, 8))), + example("HOSPITAL UTILIZATION: LAST WEEK (June 28-July 8)", + DatasetTimes("last", None, None, date(2021, 7, 8))) ] for ex in examples: assert DatasetTimes.from_header(ex.given, date(2021, 12, 31)) == ex.expected, ex.given @@ -47,7 +52,7 @@ def test_DatasetTimes_from_header(self): # test year boundary examples = [ example("TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)", - DatasetTimes("last", date(2020, 10, 30), date(2020, 10, 26))), + DatasetTimes("last", date(2020, 10, 30), date(2020, 10, 26), None)), ] for ex in examples: assert DatasetTimes.from_header(ex.given, date(2021, 1, 1)) == ex.expected, ex.given @@ -67,6 +72,12 @@ def test_Dataset_skip_overheader(self): example("VIRAL (RT-PCR) LAB TESTING: % CHANGE FROM PREVIOUS WEEK", True), example("TESTING: DEMOGRAPHIC DATA", + True), + example("HOSPITAL UTILIZATION: LAST WEEK (January 2-8)", + False), + example("HOSPITAL UTILIZATION: CHANGE FROM PREVIOUS WEEK", + True), + example("HOSPITAL UTILIZATION: DEMOGRAPHIC DATA", True) ] for ex in examples: @@ -90,6 +101,16 @@ def test_Dataset_retain_header(self): example("Viral (RT-PCR) lab test positivity rate - last 7 days (may be an underestimate due to delayed reporting)", True), example("RT-PCR tests per 100k - last 7 days (may be an underestimate due to delayed reporting)", + False), + example("Confirmed COVID-19 admissions - last 7 days", + True), + example("Confirmed COVID-19 admissions - percent change", + False), + example("Confirmed COVID-19 admissions - last 7 days - ages <18", + False), + example("Confirmed COVID-19 admissions - last 7 days - age unknown", + False), + example("Confirmed COVID-19 admissions per 100 inpatient beds - last 7 days", False) ] for ex in examples: @@ -147,15 +168,10 @@ def test_nation_from_state(self): geomapper = GeoMapper() state_pop = geomapper.get_crosswalk("state_id", "pop") - test_df = geomapper.replace_geocode( - pd.DataFrame({ - 'geo_id': ['pa', 'wv'], + test_df = pd.DataFrame({ + 'state_id': ['pa', 'wv'], 'timestamp': [datetime(year=2020, month=1, day=1)]*2, - 'val': [15., 150.],}), - "state_id", - "state_code", - "geo_id" - ) + 'val': [15., 150.],}) pa_pop = int(state_pop.loc[state_pop.state_id == "pa", "pop"]) wv_pop = int(state_pop.loc[state_pop.state_id == "wv", "pop"]) From 691727edb8ed62d0fe0fce33ba826a25ecfa3c28 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 13 Jan 2022 17:36:56 -0500 Subject: [PATCH 12/32] automate tests --- .github/workflows/python-ci.yml | 2 +- dsew_community_profile/receiving/.gitignore | 0 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 dsew_community_profile/receiving/.gitignore diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml index adeb011a6..a581bab98 100644 --- a/.github/workflows/python-ci.yml +++ b/.github/workflows/python-ci.yml @@ -16,7 +16,7 @@ jobs: if: github.event.pull_request.draft == false strategy: matrix: - packages: [_delphi_utils_python, changehc, claims_hosp, combo_cases_and_deaths, covid_act_now, doctor_visits, google_symptoms, hhs_hosp, hhs_facilities, jhu, nchs_mortality, nowcast, quidel, quidel_covidtest, safegraph_patterns, sir_complainsalot, usafacts] + packages: [_delphi_utils_python, changehc, claims_hosp, combo_cases_and_deaths, covid_act_now, doctor_visits, dsew_community_profile, google_symptoms, hhs_hosp, hhs_facilities, jhu, nchs_mortality, nowcast, quidel, quidel_covidtest, safegraph_patterns, sir_complainsalot, usafacts] defaults: run: working-directory: ${{ matrix.packages }} diff --git a/dsew_community_profile/receiving/.gitignore b/dsew_community_profile/receiving/.gitignore deleted file mode 100644 index e69de29bb..000000000 From 3564a399ee493fee44716f789efa5ee50df455c4 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 14 Jan 2022 12:24:24 -0500 Subject: [PATCH 13/32] keep most recent publish date --- .../delphi_dsew_community_profile/pull.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index a47309a94..aa856babc 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -254,7 +254,8 @@ def _parse_sheet(self, sheet): "timestamp": pd.to_datetime(self.times[si[0]][sig]), "val": df[si[-2]], "se": None, - "sample_size": None + "sample_size": None, + "publish_date": self.publish_date }) for si in sig_select ]) @@ -337,10 +338,21 @@ def fetch_new_reports(params, logger=None): # download and parse individual reports datasets = download_and_parse(listing, logger) - # collect like signals together + # collect like signals together, keeping most recent publish date ret = {} for sig, lst in datasets.items(): - ret[sig] = pd.concat(lst) + ret[sig] = pd.concat( + lst + ).groupby( + "timestamp" + ).apply( + lambda x: x[x["publish_date"] == x["publish_date"].max()] + ).drop( + "publish_date", axis=1 + ) + + if ret[sig].index.names and ret[sig].index.names[0] == "timestamp": + ret[sig] = ret[sig].droplevel("timestamp") # add nation from state geomapper = GeoMapper() From 9e2ab62ed4ec5ecdf34b491573f0ed998e914823 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 14 Jan 2022 12:25:18 -0500 Subject: [PATCH 14/32] fix rate weight calculation when multiple days present --- dsew_community_profile/delphi_dsew_community_profile/pull.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index aa856babc..807076624 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -323,7 +323,7 @@ def nation_from_state(df, sig, geomapper): if SIGNALS[sig]["is_rate"]: # true if sig is a rate df = geomapper.add_population_column(df, "state_id") \ .rename(columns={"population":"weight"}) - df.weight = df.weight / df.weight.sum() + df.weight = df.weight / df.weight.sum() * len(df.timestamp.unique()) return geomapper.replace_geocode( df, 'state_id', From 1546af66c3a77be0dbe0ce4c5df93ae450ded5c9 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 14 Jan 2022 12:58:56 -0500 Subject: [PATCH 15/32] add production params --- .../dsew_community_profile-prod.json.j2 | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 ansible/templates/dsew_community_profile-prod.json.j2 diff --git a/ansible/templates/dsew_community_profile-prod.json.j2 b/ansible/templates/dsew_community_profile-prod.json.j2 new file mode 100644 index 000000000..89cee4bf0 --- /dev/null +++ b/ansible/templates/dsew_community_profile-prod.json.j2 @@ -0,0 +1,32 @@ +{ + "common": { + "export_dir": "./receiving", + "log_filename": "dsew_cpr.log" + }, + "indicator": { + "input_cache": "./input_cache", + "reports": "new" + }, + "validation": { + "common": { + "data_source": "dsew_cpr", + "span_length": 14, + "min_expected_lag": {"all": "5"}, + "max_expected_lag": {"all": "9"}, + "dry_run": true, + "suppressed_errors": [] + }, + "static": { + "minimum_sample_size": 0, + "missing_se_allowed": true, + "missing_sample_size_allowed": true + }, + "dynamic": { + "ref_window_size": 7, + "smoothed_signals": [ + "naats_total_7dav", + "naats_positivity_7dav" + ] + } + } +} From 671ae9cbadd0f3ab96e082e5818a546742d27890 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Fri, 14 Jan 2022 16:56:51 -0500 Subject: [PATCH 16/32] make weight normalization more robust to number states per day --- .../delphi_dsew_community_profile/pull.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 807076624..f61b6fc6f 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -323,7 +323,15 @@ def nation_from_state(df, sig, geomapper): if SIGNALS[sig]["is_rate"]: # true if sig is a rate df = geomapper.add_population_column(df, "state_id") \ .rename(columns={"population":"weight"}) - df.weight = df.weight / df.weight.sum() * len(df.timestamp.unique()) + + norm_denom = df.groupby("timestamp").agg(norm_denom=("weight", "sum")) + df = df.join( + norm_denom, on="timestamp", how="left" + ).assign( + weight=lambda x: x.weight / x.norm_denom + ).drop( + "norm_denom", axis=1 + ) return geomapper.replace_geocode( df, 'state_id', From 0090c01410cd376a94f9f3e9e04acd203238ec6e Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Tue, 18 Jan 2022 15:26:00 -0500 Subject: [PATCH 17/32] prevent errors from missing hosp admissions before early jan 2021 --- .../delphi_dsew_community_profile/pull.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index f61b6fc6f..9a029854a 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -244,6 +244,16 @@ def _parse_sheet(self, sheet): ] for sig in SIGNALS: + # Hospital admissions not available at the county or CBSA level prior to Jan 8, 2021. + if (sheet.level == "msa" or sheet.level == "county") \ + and self.publish_date < datetime.date(2021, 1, 8) \ + and sig == "confirmed covid-19 admissions": + self.dfs[(sheet.level, sig)] = pd.DataFrame( + columns = ["geo_id", "timestamp", "val", \ + "se", "sample_size", "publish_date"] + ) + continue + sig_select = [s for s in select if s[-1].find(sig) >= 0] assert len(sig_select) > 0, \ f"No {sig} in any of {select}\n\nAll headers:\n{NEWLINE.join(list(df.columns))}" @@ -349,7 +359,7 @@ def fetch_new_reports(params, logger=None): # collect like signals together, keeping most recent publish date ret = {} for sig, lst in datasets.items(): - ret[sig] = pd.concat( + latest_sig_df = pd.concat( lst ).groupby( "timestamp" @@ -359,8 +369,8 @@ def fetch_new_reports(params, logger=None): "publish_date", axis=1 ) - if ret[sig].index.names and ret[sig].index.names[0] == "timestamp": - ret[sig] = ret[sig].droplevel("timestamp") + if len(latest_sig_df.index) > 0: + ret[sig] = latest_sig_df.reset_index(drop=True) # add nation from state geomapper = GeoMapper() From 481d87573d5db8fe017630fab6493e7edcd83513 Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Tue, 18 Jan 2022 17:21:52 -0500 Subject: [PATCH 18/32] [cpr] fix bugs in export date handling --- .../delphi_dsew_community_profile/pull.py | 5 ++++- .../delphi_dsew_community_profile/run.py | 7 ++++++- dsew_community_profile/params.json.template | 4 +++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/dsew_community_profile/delphi_dsew_community_profile/pull.py b/dsew_community_profile/delphi_dsew_community_profile/pull.py index 9a029854a..f09d0badf 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/pull.py +++ b/dsew_community_profile/delphi_dsew_community_profile/pull.py @@ -375,8 +375,11 @@ def fetch_new_reports(params, logger=None): # add nation from state geomapper = GeoMapper() for sig in SIGNALS: + state_key = ("state", sig) + if state_key not in ret: + continue ret[("nation", sig)] = nation_from_state( - ret[("state", sig)].rename(columns={"geo_id": "state_id"}), + ret[state_key].rename(columns={"geo_id": "state_id"}), sig, geomapper ) diff --git a/dsew_community_profile/delphi_dsew_community_profile/run.py b/dsew_community_profile/delphi_dsew_community_profile/run.py index 0276c61de..9d045187b 100644 --- a/dsew_community_profile/delphi_dsew_community_profile/run.py +++ b/dsew_community_profile/delphi_dsew_community_profile/run.py @@ -18,6 +18,7 @@ from delphi_utils import get_structured_logger from delphi_utils.export import create_export_csv +import pandas as pd from .constants import make_signal_name from .pull import fetch_new_reports @@ -37,7 +38,7 @@ def run_module(params): __name__, filename=params["common"].get("log_filename"), log_exceptions=params["common"].get("log_exceptions", True)) def replace_date_param(p): - if p in params["indicator"]: + if p in params["indicator"] and params["indicator"][p] is not None: date_param = datetime.strptime(params["indicator"][p], "%Y-%m-%d").date() params["indicator"][p] = date_param replace_date_param("export_start_date") @@ -46,6 +47,10 @@ def replace_date_param(p): 'start_date': params["indicator"].get("export_start_date", None), 'end_date': params["indicator"].get("export_end_date", None) } + export_params = { + k: pd.to_datetime(v) if v is not None else v + for k, v in export_params.items() + } run_stats = [] dfs = fetch_new_reports(params, logger) diff --git a/dsew_community_profile/params.json.template b/dsew_community_profile/params.json.template index 89cee4bf0..37096599c 100644 --- a/dsew_community_profile/params.json.template +++ b/dsew_community_profile/params.json.template @@ -5,7 +5,9 @@ }, "indicator": { "input_cache": "./input_cache", - "reports": "new" + "reports": "new", + "export_start_date": null, + "export_end_date": null }, "validation": { "common": { From a58388c75cbe57aa990ef82c86a0d7668217e1ff Mon Sep 17 00:00:00 2001 From: Kathryn M Mazaitis Date: Tue, 18 Jan 2022 17:30:05 -0500 Subject: [PATCH 19/32] Update Makefile for new ci --- dsew_community_profile/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dsew_community_profile/Makefile b/dsew_community_profile/Makefile index 72814c37b..bdea33afd 100644 --- a/dsew_community_profile/Makefile +++ b/dsew_community_profile/Makefile @@ -11,6 +11,12 @@ install: venv pip install -e ../_delphi_utils_python ;\ pip install -e . +install-ci: venv + . env/bin/activate; \ + pip install wheel ; \ + pip install ../_delphi_utils_python ;\ + pip install . + lint: . env/bin/activate; pylint $(dir) . env/bin/activate; pydocstyle $(dir) From 6b8304822297e3e4653bcd0f5756dfff3721d5ba Mon Sep 17 00:00:00 2001 From: QX Teo <37101453+qx-teo@users.noreply.github.com> Date: Wed, 19 Jan 2022 00:50:19 -0500 Subject: [PATCH 20/32] Mock covidcast/meta Added mock for requests.get function. Made sure that signals that are inactive have the 'active' tag as False, which in turn affects whether it is a valid geo_signal combo. --- .../tests/validator/test_datafetcher.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/_delphi_utils_python/tests/validator/test_datafetcher.py b/_delphi_utils_python/tests/validator/test_datafetcher.py index 9eb583e06..8f75bd6ea 100644 --- a/_delphi_utils_python/tests/validator/test_datafetcher.py +++ b/_delphi_utils_python/tests/validator/test_datafetcher.py @@ -21,25 +21,41 @@ def test_make_date_filter(self): assert not date_filter(FILENAME_REGEX.match("20200620_a_b.csv")) assert not date_filter(FILENAME_REGEX.match("202006_a_b.csv")) - # pylint: disable=fixme - # TODO: mock out the advanced meta endpoint /covidcast/meta as well - # https://github.com/cmu-delphi/covidcast-indicators/issues/1456 + # Solution from https://stackoverflow.com/questions/15753390/ + #how-can-i-mock-requests-and-the-response + def mocked_requests_get(*args, **kwargs): + class MockResponse: + def __init__(self, json_data, status_code): + self.json_data = json_data + self.status_code = status_code + + def json(self): + return self.json_data + if kwargs["params"] == {'signal':'chng:inactive'}: + return MockResponse([{"signals": [{"active": False}]}], 200) + else: + return MockResponse([{"signals": [{"active": True}]}], 200) + @mock.patch('requests.get', side_effect=mocked_requests_get) @mock.patch("covidcast.metadata") - def test_get_geo_signal_combos(self, mock_metadata): + def test_get_geo_signal_combos(self, mock_metadata, mock_get): """Test that the geo signal combos are correctly pulled from the covidcast metadata.""" # Need to use actual data_source and signal names since we reference the API + # We let the chng signal "inactive" be an inactive signal mock_metadata.return_value = pd.DataFrame({"data_source": ["chng", "chng", "chng", "covid-act-now", "covid-act-now", - "covid-act-now"], + "covid-act-now", + "chng"], "signal": ["smoothed_outpatient_cli", "smoothed_outpatient_covid", "smoothed_outpatient_covid", "pcr_specimen_positivity_rate", "pcr_specimen_positivity_rate", - "pcr_specimen_total_tests"], + "pcr_specimen_total_tests", + "inactive"], "geo_type": ["state", "state", "county", - "hrr", "msa", "msa"] + "hrr", "msa", "msa", + "state"] }) assert set(get_geo_signal_combos("chng")) == set( From aa16b00e31b35315a508a08789c3ee392f866417 Mon Sep 17 00:00:00 2001 From: QX Teo <37101453+qx-teo@users.noreply.github.com> Date: Wed, 19 Jan 2022 01:04:11 -0500 Subject: [PATCH 21/32] Remove hard-coding from get_geo_signal_combos Uses covidcast/meta to find source-signal pairings. Issue: how to resolve the db_source quidel mapping to multiple sources quidel_covid_ag and quidel_flu? --- .../delphi_utils/validator/datafetcher.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/_delphi_utils_python/delphi_utils/validator/datafetcher.py b/_delphi_utils_python/delphi_utils/validator/datafetcher.py index 17db7c4bb..9b4476299 100644 --- a/_delphi_utils_python/delphi_utils/validator/datafetcher.py +++ b/_delphi_utils_python/delphi_utils/validator/datafetcher.py @@ -111,14 +111,9 @@ def get_geo_signal_combos(data_source): Cross references based on combinations reported available by COVIDcast metadata. """ # Maps data_source name with what's in the API, lists used in case of multiple names - # pylint: disable=fixme - # TODO: Extract this mapping from meta response instead of hard-coding - # https://github.com/cmu-delphi/covidcast-indicators/issues/1457 - source_signal_mappings = { - 'indicator-combination': ['indicator-combination-cases-deaths'], - 'quidel': ['quidel-covid-ag'], - 'safegraph': ['safegraph-weekly'] - } + + source_signal_mappings = {i['db_source']:i['source'] for i in + requests.get("https://api.covidcast.cmu.edu/epidata/covidcast/meta").json()} meta = covidcast.metadata() source_meta = meta[meta['data_source'] == data_source] # Need to convert np.records to tuples so they are hashable and can be used in sets and dicts. From 22d04911bd46dd2e0ee2b6f8408641515974001d Mon Sep 17 00:00:00 2001 From: QX Teo <37101453+qx-teo@users.noreply.github.com> Date: Wed, 19 Jan 2022 01:12:12 -0500 Subject: [PATCH 22/32] Fix lint --- _delphi_utils_python/delphi_utils/validator/datafetcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_delphi_utils_python/delphi_utils/validator/datafetcher.py b/_delphi_utils_python/delphi_utils/validator/datafetcher.py index 9b4476299..7cde0fd45 100644 --- a/_delphi_utils_python/delphi_utils/validator/datafetcher.py +++ b/_delphi_utils_python/delphi_utils/validator/datafetcher.py @@ -112,7 +112,7 @@ def get_geo_signal_combos(data_source): """ # Maps data_source name with what's in the API, lists used in case of multiple names - source_signal_mappings = {i['db_source']:i['source'] for i in + source_signal_mappings = {i['db_source']:i['source'] for i in requests.get("https://api.covidcast.cmu.edu/epidata/covidcast/meta").json()} meta = covidcast.metadata() source_meta = meta[meta['data_source'] == data_source] From dcd0acc2938f1636ad987fae60ada8762392760a Mon Sep 17 00:00:00 2001 From: QX Teo <37101453+qx-teo@users.noreply.github.com> Date: Wed, 19 Jan 2022 10:24:55 -0500 Subject: [PATCH 23/32] Swopping keys and values for source_signal_mappings dict Previously, keys weren't unique but values are. Swopped this to fix the issue. --- _delphi_utils_python/delphi_utils/validator/datafetcher.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/_delphi_utils_python/delphi_utils/validator/datafetcher.py b/_delphi_utils_python/delphi_utils/validator/datafetcher.py index 7cde0fd45..3f24843a2 100644 --- a/_delphi_utils_python/delphi_utils/validator/datafetcher.py +++ b/_delphi_utils_python/delphi_utils/validator/datafetcher.py @@ -112,7 +112,7 @@ def get_geo_signal_combos(data_source): """ # Maps data_source name with what's in the API, lists used in case of multiple names - source_signal_mappings = {i['db_source']:i['source'] for i in + source_signal_mappings = {i['source']:i['db_source'] for i in requests.get("https://api.covidcast.cmu.edu/epidata/covidcast/meta").json()} meta = covidcast.metadata() source_meta = meta[meta['data_source'] == data_source] @@ -125,8 +125,9 @@ def get_geo_signal_combos(data_source): # True/False indicate if status is active, "unknown" means we should check sig_combo_seen = dict() for combo in geo_signal_combos: - if source_signal_mappings.get(data_source): - src_list = source_signal_mappings.get(data_source) + if data_source in source_signal_mappings.values(): + src_list = [key for (key, value) in source_signal_mappings.items() + if value == data_source] else: src_list = [data_source] for src in src_list: From 2be28c5bda0f1f0c1fe89a53c38a5d5bc244c1f2 Mon Sep 17 00:00:00 2001 From: Katie Mazaitis Date: Wed, 19 Jan 2022 10:26:14 -0500 Subject: [PATCH 24/32] Update package description Co-authored-by: nmdefries <42820733+nmdefries@users.noreply.github.com> --- dsew_community_profile/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsew_community_profile/setup.py b/dsew_community_profile/setup.py index 258126fb1..fb5f9d4a9 100644 --- a/dsew_community_profile/setup.py +++ b/dsew_community_profile/setup.py @@ -16,7 +16,7 @@ setup( name="delphi_dsew_community_profile", version="0.1.0", - description="Indicator tracking specimen test results published in the COVID-19 Community Profile Report by the Data Strategy and Execution Workgroup", + description="Indicator tracking specimen test results and hospital admissions published in the COVID-19 Community Profile Report by the Data Strategy and Execution Workgroup", author="", author_email="", url="https://github.com/cmu-delphi/covidcast-indicators", From 89c9cf7975ee140b30d1b77ed488069967b75b6c Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Wed, 19 Jan 2022 13:34:35 -0500 Subject: [PATCH 25/32] Productionize! - Update prod params file with ingestion and log directories. - Add the indicator directory to the Jenkins build list. --- Jenkinsfile | 2 +- ansible/templates/dsew_community_profile-prod.json.j2 | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a54087336..5372e4554 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,7 +9,7 @@ - Keep in sync with '.github/workflows/python-ci.yml'. - TODO: #527 Get this list automatically from python-ci.yml at runtime. */ -def indicator_list = ["changehc", "claims_hosp", "facebook", "google_symptoms", "hhs_hosp", "jhu", "nchs_mortality", "quidel", "quidel_covidtest", "safegraph_patterns", "sir_complainsalot", "usafacts"] +def indicator_list = ["changehc", "claims_hosp", "facebook", "google_symptoms", "hhs_hosp", "jhu", "nchs_mortality", "quidel", "quidel_covidtest", "safegraph_patterns", "sir_complainsalot", "usafacts", "dsew_community_profile"] def build_package = [:] def deploy_staging = [:] def deploy_production = [:] diff --git a/ansible/templates/dsew_community_profile-prod.json.j2 b/ansible/templates/dsew_community_profile-prod.json.j2 index 89cee4bf0..c7f7c0604 100644 --- a/ansible/templates/dsew_community_profile-prod.json.j2 +++ b/ansible/templates/dsew_community_profile-prod.json.j2 @@ -1,7 +1,7 @@ { "common": { - "export_dir": "./receiving", - "log_filename": "dsew_cpr.log" + "export_dir": "/common/covidcast/receiving/dsew-cpr", + "log_filename": "/var/log/indicators/dsew_cpr.log" }, "indicator": { "input_cache": "./input_cache", @@ -9,7 +9,7 @@ }, "validation": { "common": { - "data_source": "dsew_cpr", + "data_source": "dsew-cpr", "span_length": 14, "min_expected_lag": {"all": "5"}, "max_expected_lag": {"all": "9"}, From 875a0e151633aa714b12ee2eabaa8481aedac9fa Mon Sep 17 00:00:00 2001 From: Brian Clark Date: Wed, 19 Jan 2022 14:43:40 -0500 Subject: [PATCH 26/32] Rename params file so it gets picked up by Jenkins --- ...le-prod.json.j2 => dsew_community_profile-params-prod.json.j2} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ansible/templates/{dsew_community_profile-prod.json.j2 => dsew_community_profile-params-prod.json.j2} (100%) diff --git a/ansible/templates/dsew_community_profile-prod.json.j2 b/ansible/templates/dsew_community_profile-params-prod.json.j2 similarity index 100% rename from ansible/templates/dsew_community_profile-prod.json.j2 rename to ansible/templates/dsew_community_profile-params-prod.json.j2 From 7bbbb3c58d23e252e19c444b3c3795d66f64eab5 Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Wed, 19 Jan 2022 16:57:33 -0500 Subject: [PATCH 27/32] update pipeline to support Wave 13 --- .../templates/facebook-params-prod.json.j2 | 1 + .../sir_complainsalot-params-prod.json.j2 | 35 ++++++++++++++++++- facebook/delphiFacebook/R/responses.R | 4 ++- facebook/params.json.production.template | 9 +++-- facebook/qsf-tools/qsf-differ.R | 8 +++-- sir_complainsalot/params.json.template | 35 ++++++++++++++++++- 6 files changed, 83 insertions(+), 9 deletions(-) diff --git a/ansible/templates/facebook-params-prod.json.j2 b/ansible/templates/facebook-params-prod.json.j2 index c4b0af5f9..c0732b8bb 100644 --- a/ansible/templates/facebook-params-prod.json.j2 +++ b/ansible/templates/facebook-params-prod.json.j2 @@ -38,6 +38,7 @@ "Survey of COVID-Like Illness - Wave 11": "fb-survey", "Survey of COVID-Like Illness - Wave 12": "fb-survey", "Survey of COVID-Like Illness - Wave 12 - Full Launch": "fb-survey", + "Survey of COVID-Like Illness - Wave 13": "fb-survey", "Survey of COVID-Like Illness - Wave 4": "fb-survey", "Survey of COVID-Like Illness - Wave 5": "fb-survey", "Survey of COVID-Like Illness - Wave 6": "fb-survey", diff --git a/ansible/templates/sir_complainsalot-params-prod.json.j2 b/ansible/templates/sir_complainsalot-params-prod.json.j2 index 94c2c8160..e95eab49f 100644 --- a/ansible/templates/sir_complainsalot-params-prod.json.j2 +++ b/ansible/templates/sir_complainsalot-params-prod.json.j2 @@ -90,7 +90,40 @@ ["smoothed_dontneed_reason_not_high_risk", "hrr"], ["smoothed_wdontneed_reason_not_high_risk", "hrr"], ["smoothed_dontneed_reason_not_serious", "hrr"], ["smoothed_wdontneed_reason_not_serious", "hrr"], ["smoothed_dontneed_reason_other", "hrr"], ["smoothed_wdontneed_reason_other", "hrr"], - ["smoothed_dontneed_reason_precautions", "hrr"], ["smoothed_wdontneed_reason_precautions", "hrr"] + ["smoothed_dontneed_reason_precautions", "hrr"], ["smoothed_wdontneed_reason_precautions", "hrr"], + "smoothed_screening_tested_positive_14d", "smoothed_wscreening_tested_positive_14d", + "smoothed_travel_outside_state_7d", "smoothed_wtravel_outside_state_7d", + "smoothed_belief_vaccinated_mask_unnecessary", "smoothed_wbelief_vaccinated_mask_unnecessary", + "smoothed_belief_children_immune", "smoothed_wbelief_children_immune", + "smoothed_received_2_vaccine_doses", "smoothed_wreceived_2_vaccine_doses", + "smoothed_vaccine_barrier_eligible", "smoothed_wvaccine_barrier_eligible", + "smoothed_vaccine_barrier_no_appointments", "smoothed_wvaccine_barrier_no_appointments", + "smoothed_vaccine_barrier_appointment_time", "smoothed_wvaccine_barrier_appointment_time", + "smoothed_vaccine_barrier_technical_difficulties", "smoothed_wvaccine_barrier_technical_difficulties", + "smoothed_vaccine_barrier_document", "smoothed_wvaccine_barrier_document", + "smoothed_vaccine_barrier_technology_access", "smoothed_wvaccine_barrier_technology_access", + "smoothed_vaccine_barrier_travel", "smoothed_wvaccine_barrier_travel", + "smoothed_vaccine_barrier_language", "smoothed_wvaccine_barrier_language", + "smoothed_vaccine_barrier_childcare", "smoothed_wvaccine_barrier_childcare", + "smoothed_vaccine_barrier_time", "smoothed_wvaccine_barrier_time", + "smoothed_vaccine_barrier_type", "smoothed_wvaccine_barrier_type", + "smoothed_vaccine_barrier_none", "smoothed_wvaccine_barrier_none", + "smoothed_vaccine_barrier_appointment_location", "smoothed_wvaccine_barrier_appointment_location", + "smoothed_vaccine_barrier_other", "smoothed_wvaccine_barrier_other", + "smoothed_vaccine_barrier_eligible_has", "smoothed_wvaccine_barrier_eligible_has", + "smoothed_vaccine_barrier_no_appointments_has", "smoothed_wvaccine_barrier_no_appointments_has", + "smoothed_vaccine_barrier_appointment_time_has", "smoothed_wvaccine_barrier_appointment_time_has", + "smoothed_vaccine_barrier_technical_difficulties_has", "smoothed_wvaccine_barrier_technical_difficulties_has", + "smoothed_vaccine_barrier_document_has", "smoothed_wvaccine_barrier_document_has", + "smoothed_vaccine_barrier_technology_access_has", "smoothed_wvaccine_barrier_technology_access_has", + "smoothed_vaccine_barrier_travel_has", "smoothed_wvaccine_barrier_travel_has", + "smoothed_vaccine_barrier_language_has", "smoothed_wvaccine_barrier_language_has", + "smoothed_vaccine_barrier_childcare_has", "smoothed_wvaccine_barrier_childcare_has", + "smoothed_vaccine_barrier_time_has", "smoothed_wvaccine_barrier_time_has", + "smoothed_vaccine_barrier_type_has", "smoothed_wvaccine_barrier_type_has", + "smoothed_vaccine_barrier_none_has", "smoothed_wvaccine_barrier_none_has", + "smoothed_vaccine_barrier_appointment_location_has", "smoothed_wvaccine_barrier_appointment_location_has", + "smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has" ] }, "quidel": { diff --git a/facebook/delphiFacebook/R/responses.R b/facebook/delphiFacebook/R/responses.R index fbdc17ad1..8f0c7f2db 100644 --- a/facebook/delphiFacebook/R/responses.R +++ b/facebook/delphiFacebook/R/responses.R @@ -617,6 +617,7 @@ create_complete_responses <- function(input_data, county_crosswalk, params) "I6_1", "I6_2", "I6_3", "I6_4", "I6_5", "I6_6", "I6_7", "I6_8", "I7", "K1", "K2", "V11a", "V12a", "V15a", "V15b", "V16", "V3a", # added in Wave 11 "V1alt", "B13a", "V15c", "P1", "P2", "P3", "P4", "P5", "P6", # added in experimental Wave 12 + "C17b", "V17", "V2a", "V2b", "V2c", # added in Wave 13 "raceethnicity", "token", "wave", "w12_treatment", "module", "UserLanguage", "zip5" # temporarily; we'll filter by this column later and then drop it before writing @@ -694,7 +695,8 @@ surveyID_to_wave <- Vectorize(function(surveyID) { "SV_6PADB8DyF9SIyXk" = 10, "SV_4VEaeffqQtDo33M" = 11, "SV_3TL0r243mLkDzCK" = 12.5, # experimental version of Wave 12 - "SV_eDISRi5wQcNU70G" = 12 # finalized version of Wave 12 + "SV_eDISRi5wQcNU70G" = 12, # finalized version of Wave 12 + "SV_2iv3tPKlYKqnalM" = 13 ) if ( any(names(waves) == surveyID) ) { diff --git a/facebook/params.json.production.template b/facebook/params.json.production.template index 10722e836..e6d89ac38 100644 --- a/facebook/params.json.production.template +++ b/facebook/params.json.production.template @@ -27,13 +27,16 @@ "active": { "Survey of COVID-Like Illness - TODEPLOY ...... - US Expansion": "fb-survey", "Survey of COVID-Like Illness - TODEPLOY- US Expansion - With Translations": "fb-survey", + "Survey of COVID-Like Illness - Wave 10": "fb-survey", + "Survey of COVID-Like Illness - Wave 11": "fb-survey", + "Survey of COVID-Like Illness - Wave 12": "fb-survey", + "Survey of COVID-Like Illness - Wave 12 - Full Launch": "fb-survey", + "Survey of COVID-Like Illness - Wave 13": "fb-survey", "Survey of COVID-Like Illness - Wave 4": "fb-survey", "Survey of COVID-Like Illness - Wave 5": "fb-survey", "Survey of COVID-Like Illness - Wave 6": "fb-survey", "Survey of COVID-Like Illness - Wave 7": "fb-survey", - "Survey of COVID-Like Illness - Wave 8": "fb-survey", - "Survey of COVID-Like Illness - Wave 10": "fb-survey", - "Survey of COVID-Like Illness - Wave 11": "fb-survey" + "Survey of COVID-Like Illness - Wave 8": "fb-survey" }, "dormant": { "COVID-Like Illness 4-question survey - Amazon - DEPLOY": "", diff --git a/facebook/qsf-tools/qsf-differ.R b/facebook/qsf-tools/qsf-differ.R index 4af4a6a7f..4ac05c937 100644 --- a/facebook/qsf-tools/qsf-differ.R +++ b/facebook/qsf-tools/qsf-differ.R @@ -9,6 +9,8 @@ ## Writes the lists of new and changed items to STDOUT, so redirect STDOUT to ## your desired location. +options(warn = 1) + suppressPackageStartupMessages({ library(jsonlite) library(stringr) @@ -31,8 +33,8 @@ diff_qsf_files <- function(old_qsf_path, new_qsf_path) { #' Fetch and format a single .qsf file, keeping block and question info #' #' @param path path to Qualtrics survey file in .qsf format -#' @param keep_items string or character vector of survey item fields to keep. -#' Setting to "all" keeps all fields. +#' @param keep_items character vector of survey item fields to keep. +#' Setting to c("all") keeps all fields. #' #' @return A named list get_qsf_file <- function(path, @@ -59,7 +61,7 @@ get_qsf_file <- function(path, next } - if (keep_items != "all") { + if (!identical(keep_items, c("all"))) { question <- question[names(question) %in% c("QuestionID", keep_items)] } diff --git a/sir_complainsalot/params.json.template b/sir_complainsalot/params.json.template index 2df469f02..6c07dd313 100644 --- a/sir_complainsalot/params.json.template +++ b/sir_complainsalot/params.json.template @@ -89,7 +89,40 @@ ["smoothed_dontneed_reason_not_high_risk", "hrr"], ["smoothed_wdontneed_reason_not_high_risk", "hrr"], ["smoothed_dontneed_reason_not_serious", "hrr"], ["smoothed_wdontneed_reason_not_serious", "hrr"], ["smoothed_dontneed_reason_other", "hrr"], ["smoothed_wdontneed_reason_other", "hrr"], - ["smoothed_dontneed_reason_precautions", "hrr"], ["smoothed_wdontneed_reason_precautions", "hrr"] + ["smoothed_dontneed_reason_precautions", "hrr"], ["smoothed_wdontneed_reason_precautions", "hrr"], + "smoothed_screening_tested_positive_14d", "smoothed_wscreening_tested_positive_14d", + "smoothed_travel_outside_state_7d", "smoothed_wtravel_outside_state_7d", + "smoothed_belief_vaccinated_mask_unnecessary", "smoothed_wbelief_vaccinated_mask_unnecessary", + "smoothed_belief_children_immune", "smoothed_wbelief_children_immune", + "smoothed_received_2_vaccine_doses", "smoothed_wreceived_2_vaccine_doses", + "smoothed_vaccine_barrier_eligible", "smoothed_wvaccine_barrier_eligible", + "smoothed_vaccine_barrier_no_appointments", "smoothed_wvaccine_barrier_no_appointments", + "smoothed_vaccine_barrier_appointment_time", "smoothed_wvaccine_barrier_appointment_time", + "smoothed_vaccine_barrier_technical_difficulties", "smoothed_wvaccine_barrier_technical_difficulties", + "smoothed_vaccine_barrier_document", "smoothed_wvaccine_barrier_document", + "smoothed_vaccine_barrier_technology_access", "smoothed_wvaccine_barrier_technology_access", + "smoothed_vaccine_barrier_travel", "smoothed_wvaccine_barrier_travel", + "smoothed_vaccine_barrier_language", "smoothed_wvaccine_barrier_language", + "smoothed_vaccine_barrier_childcare", "smoothed_wvaccine_barrier_childcare", + "smoothed_vaccine_barrier_time", "smoothed_wvaccine_barrier_time", + "smoothed_vaccine_barrier_type", "smoothed_wvaccine_barrier_type", + "smoothed_vaccine_barrier_none", "smoothed_wvaccine_barrier_none", + "smoothed_vaccine_barrier_appointment_location", "smoothed_wvaccine_barrier_appointment_location", + "smoothed_vaccine_barrier_other", "smoothed_wvaccine_barrier_other", + "smoothed_vaccine_barrier_eligible_has", "smoothed_wvaccine_barrier_eligible_has", + "smoothed_vaccine_barrier_no_appointments_has", "smoothed_wvaccine_barrier_no_appointments_has", + "smoothed_vaccine_barrier_appointment_time_has", "smoothed_wvaccine_barrier_appointment_time_has", + "smoothed_vaccine_barrier_technical_difficulties_has", "smoothed_wvaccine_barrier_technical_difficulties_has", + "smoothed_vaccine_barrier_document_has", "smoothed_wvaccine_barrier_document_has", + "smoothed_vaccine_barrier_technology_access_has", "smoothed_wvaccine_barrier_technology_access_has", + "smoothed_vaccine_barrier_travel_has", "smoothed_wvaccine_barrier_travel_has", + "smoothed_vaccine_barrier_language_has", "smoothed_wvaccine_barrier_language_has", + "smoothed_vaccine_barrier_childcare_has", "smoothed_wvaccine_barrier_childcare_has", + "smoothed_vaccine_barrier_time_has", "smoothed_wvaccine_barrier_time_has", + "smoothed_vaccine_barrier_type_has", "smoothed_wvaccine_barrier_type_has", + "smoothed_vaccine_barrier_none_has", "smoothed_wvaccine_barrier_none_has", + "smoothed_vaccine_barrier_appointment_location_has", "smoothed_wvaccine_barrier_appointment_location_has", + "smoothed_vaccine_barrier_other_has", "smoothed_wvaccine_barrier_other_has" ] }, "quidel": { From e8dac87aa323bb94d13409bd641287fd397ea2ec Mon Sep 17 00:00:00 2001 From: QX Teo <37101453+qx-teo@users.noreply.github.com> Date: Thu, 20 Jan 2022 01:11:39 -0500 Subject: [PATCH 28/32] Fix requests.get mock #1470 mocked out requests.get call, but #1471 uses another requests.get call that did not get mocked properly. We check which call this is and provide the apporpriate mocked response. --- _delphi_utils_python/tests/validator/test_datafetcher.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/_delphi_utils_python/tests/validator/test_datafetcher.py b/_delphi_utils_python/tests/validator/test_datafetcher.py index 8f75bd6ea..6016df3a2 100644 --- a/_delphi_utils_python/tests/validator/test_datafetcher.py +++ b/_delphi_utils_python/tests/validator/test_datafetcher.py @@ -31,7 +31,10 @@ def __init__(self, json_data, status_code): def json(self): return self.json_data - if kwargs["params"] == {'signal':'chng:inactive'}: + if len(kwargs) == 0: + return MockResponse([{'source': 'chng', 'db_source': 'chng'}, + {'source': 'covid-act-now', 'db_source': 'covid-act-now'}], 200) + elif kwargs["params"] == {'signal': 'chng:inactive'}: return MockResponse([{"signals": [{"active": False}]}], 200) else: return MockResponse([{"signals": [{"active": True}]}], 200) From 027769ba335781529a8a77671a043da6e807f35a Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 20 Jan 2022 13:25:12 -0500 Subject: [PATCH 29/32] deduplicate microdata output fields --- facebook/delphiFacebook/R/responses.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/facebook/delphiFacebook/R/responses.R b/facebook/delphiFacebook/R/responses.R index 8f0c7f2db..8f9942f9a 100644 --- a/facebook/delphiFacebook/R/responses.R +++ b/facebook/delphiFacebook/R/responses.R @@ -617,7 +617,7 @@ create_complete_responses <- function(input_data, county_crosswalk, params) "I6_1", "I6_2", "I6_3", "I6_4", "I6_5", "I6_6", "I6_7", "I6_8", "I7", "K1", "K2", "V11a", "V12a", "V15a", "V15b", "V16", "V3a", # added in Wave 11 "V1alt", "B13a", "V15c", "P1", "P2", "P3", "P4", "P5", "P6", # added in experimental Wave 12 - "C17b", "V17", "V2a", "V2b", "V2c", # added in Wave 13 + "C17b", "V17", "V2b", "V2c", # added in Wave 13 "raceethnicity", "token", "wave", "w12_treatment", "module", "UserLanguage", "zip5" # temporarily; we'll filter by this column later and then drop it before writing From 6f89e0db0c8b1cd9b5055396f9483c9f32e5efdf Mon Sep 17 00:00:00 2001 From: Nat DeFries <42820733+nmdefries@users.noreply.github.com> Date: Thu, 20 Jan 2022 13:36:15 -0500 Subject: [PATCH 30/32] V17 actually reported as two fields --- facebook/delphiFacebook/R/responses.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/facebook/delphiFacebook/R/responses.R b/facebook/delphiFacebook/R/responses.R index 8f9942f9a..ba9cc5347 100644 --- a/facebook/delphiFacebook/R/responses.R +++ b/facebook/delphiFacebook/R/responses.R @@ -617,7 +617,7 @@ create_complete_responses <- function(input_data, county_crosswalk, params) "I6_1", "I6_2", "I6_3", "I6_4", "I6_5", "I6_6", "I6_7", "I6_8", "I7", "K1", "K2", "V11a", "V12a", "V15a", "V15b", "V16", "V3a", # added in Wave 11 "V1alt", "B13a", "V15c", "P1", "P2", "P3", "P4", "P5", "P6", # added in experimental Wave 12 - "C17b", "V17", "V2b", "V2c", # added in Wave 13 + "C17b", "V17_1", "V17_2", "V2b", "V2c", # added in Wave 13 "raceethnicity", "token", "wave", "w12_treatment", "module", "UserLanguage", "zip5" # temporarily; we'll filter by this column later and then drop it before writing From 7fc264bc60fa68dd37d06c28e107751b7f47efb9 Mon Sep 17 00:00:00 2001 From: Delphi Deploy Bot Date: Thu, 20 Jan 2022 22:22:54 +0000 Subject: [PATCH 31/32] chore: bump delphi_utils to 0.2.10 --- _delphi_utils_python/.bumpversion.cfg | 2 +- _delphi_utils_python/delphi_utils/__init__.py | 2 +- _delphi_utils_python/setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/_delphi_utils_python/.bumpversion.cfg b/_delphi_utils_python/.bumpversion.cfg index 6a3eb41c5..fb2939b21 100644 --- a/_delphi_utils_python/.bumpversion.cfg +++ b/_delphi_utils_python/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.9 +current_version = 0.2.10 commit = True message = chore: bump delphi_utils to {new_version} tag = False diff --git a/_delphi_utils_python/delphi_utils/__init__.py b/_delphi_utils_python/delphi_utils/__init__.py index 8e5e8e86a..4fbbba976 100644 --- a/_delphi_utils_python/delphi_utils/__init__.py +++ b/_delphi_utils_python/delphi_utils/__init__.py @@ -15,4 +15,4 @@ from .nancodes import Nans from .weekday import Weekday -__version__ = "0.2.9" +__version__ = "0.2.10" diff --git a/_delphi_utils_python/setup.py b/_delphi_utils_python/setup.py index 064eb64b8..e95f8e2f6 100644 --- a/_delphi_utils_python/setup.py +++ b/_delphi_utils_python/setup.py @@ -26,7 +26,7 @@ setup( name="delphi_utils", - version="0.2.9", + version="0.2.10", description="Shared Utility Functions for Indicators", long_description=long_description, long_description_content_type="text/markdown", From 3af5cdc30c9d4c8f6c10a23f26712c27730c05df Mon Sep 17 00:00:00 2001 From: Delphi Deploy Bot Date: Thu, 20 Jan 2022 22:22:54 +0000 Subject: [PATCH 32/32] chore: bump covidcast-indicators to 0.2.22 --- .bumpversion.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 85ccc58ab..3d6e03577 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.21 +current_version = 0.2.22 commit = True message = chore: bump covidcast-indicators to {new_version} tag = False