
Commit

Merge pull request #2015 from cmu-delphi/fix_new_date_for_google_symptoms

Fix new date for google symptoms
nmdefries authored Aug 12, 2024
2 parents 4168db5 + d5be1bd commit c7bf03e
Showing 14 changed files with 42,234 additions and 34 deletions.
21 changes: 13 additions & 8 deletions google_symptoms/delphi_google_symptoms/date_utils.py
@@ -73,24 +73,29 @@ def generate_num_export_days(params: Dict, logger) -> [int]:
         "_".join([metric, smoother, "search"]) for metric, smoother in product(COMBINED_METRIC, SMOOTHERS)
     )
 
-    # Fetch metadata to check how recent each signal is
-    covidcast.use_api_key(params["indicator"]["api_credentials"])
-    metadata = covidcast.metadata()
-    # Filter to only those signals we currently want to produce for `google-symptoms`
-    gs_metadata = metadata[(metadata.data_source == "google-symptoms") & (metadata.signal.isin(sensor_names))]
-
     num_export_days = params["indicator"]["num_export_days"]
     custom_run = False if not params["common"].get("custom_run") else params["common"].get("custom_run", False)
 
     if num_export_days is None and not custom_run:
+        # Fetch metadata to check how recent each signal is
+        covidcast.use_api_key(params["indicator"]["api_credentials"])
+        metadata = covidcast.metadata()
+        # Filter to only those signals we currently want to produce for `google-symptoms`
+        gs_metadata = metadata[(metadata.data_source == "google-symptoms") & (metadata.signal.isin(sensor_names))]
+
         if sensor_names.difference(set(gs_metadata.signal)):
             # If any signal not in metadata yet, we need to backfill its full history.
             logger.warning("Signals missing in the epidata; backfilling full history")
             num_export_days = (export_end_date - FULL_BKFILL_START_DATE).days + 1
         else:
             latest_date_diff = (datetime.today() - to_datetime(min(gs_metadata.max_time))).days + 1
-            global_max_expected_lag = get_max_lag(params)
-            expected_date_diff = params["validation"]["common"].get("span_length", 14) + global_max_expected_lag
+            expected_date_diff = params["validation"]["common"].get("span_length", 14)
+
+            # there's an expected lag of 4 days behind if running from today
+            if export_end_date.date() == datetime.today().date():
+                global_max_expected_lag = get_max_lag(params)
+                expected_date_diff += global_max_expected_lag
 
             if latest_date_diff > expected_date_diff:
                 logger.info(f"Missing dates from: {to_datetime(min(gs_metadata.max_time)).date()}")
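Note: the substantive change in date_utils.py is that the expected reporting lag is now added to the allowed data gap only when the export window runs through today, so historical/patch runs no longer get extra slack for a lag that does not apply to them. A minimal stand-alone sketch of that decision (the helper name expected_gap_days and the literal values are illustrative only; the real code reads span_length from params and the lag from get_max_lag):

from datetime import date, timedelta

def expected_gap_days(span_length: int, max_lag: int, export_end_date: date, today: date) -> int:
    """Days the newest reported data may lag before the run decides dates are missing."""
    gap = span_length
    # Pad with the expected reporting lag only when the export window ends today;
    # a historical (patch) run gets no extra slack.
    if export_end_date == today:
        gap += max_lag
    return gap

print(expected_gap_days(14, 4, date.today(), date.today()))                      # 18
print(expected_gap_days(14, 4, date.today() - timedelta(days=7), date.today()))  # 14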
2 changes: 1 addition & 1 deletion google_symptoms/delphi_google_symptoms/patch.py
@@ -71,7 +71,7 @@ def patch(params):
 
     # Output dir setup
     current_issue_yyyymmdd = issue_date.strftime("%Y%m%d")
-    current_issue_dir = f"""{params["patch"]["patch_dir"]}/issue_{current_issue_yyyymmdd}/google-symptom"""
+    current_issue_dir = f"""{params["patch"]["patch_dir"]}/issue_{current_issue_yyyymmdd}/google-symptoms"""
     makedirs(f"{current_issue_dir}", exist_ok=True)
 
     params["common"]["export_dir"] = f"""{current_issue_dir}"""
3 changes: 1 addition & 2 deletions google_symptoms/delphi_google_symptoms/pull.py
@@ -248,10 +248,10 @@ def pull_gs_data(credentials, export_start_date, export_end_date, num_export_day
 
     # For state level data
     dfs["state"] = pull_gs_data_one_geolevel("state", retrieve_dates)
 
     # For county level data
    dfs["county"] = pull_gs_data_one_geolevel("county", retrieve_dates)
-
+
     # Add District of Columbia as county
     try:
         df_dc_county = dfs["state"][dfs["state"]["geo_id"] == "dc"].drop(
@@ -260,5 +260,4 @@ def pull_gs_data(credentials, export_start_date, export_end_date, num_export_day
         dfs["county"] = pd.concat([dfs["county"], df_dc_county])
     except KeyError:
         pass
-
     return dfs
8 changes: 5 additions & 3 deletions google_symptoms/delphi_google_symptoms/run.py
@@ -5,13 +5,13 @@
 when the module is run with `python -m delphi_google_symptoms`.
 """
 import time
-from datetime import datetime, date
+from datetime import date, datetime
 from itertools import product
 
 import numpy as np
 from delphi_utils import create_export_csv, get_structured_logger
 
-from .constants import COMBINED_METRIC, GEO_RESOLUTIONS, SMOOTHERS, SMOOTHERS_MAP
+from .constants import COMBINED_METRIC, FULL_BKFILL_START_DATE, GEO_RESOLUTIONS, SMOOTHERS, SMOOTHERS_MAP
 from .date_utils import generate_num_export_days
 from .geo import geo_map
 from .pull import pull_gs_data
@@ -47,7 +47,9 @@ def run_module(params, logger=None):
         log_exceptions=params["common"].get("log_exceptions", True),
     )
 
-    export_start_date = datetime.strptime(params["indicator"]["export_start_date"], "%Y-%m-%d")
+    export_start_date = datetime.strptime(
+        params["indicator"].get("export_start_date", datetime.strftime(FULL_BKFILL_START_DATE, "%Y-%m-%d")), "%Y-%m-%d"
+    )
     # If end_date not specified, use current date.
     export_end_date = datetime.strptime(
         params["indicator"].get("export_end_date", datetime.strftime(date.today(), "%Y-%m-%d")), "%Y-%m-%d"
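Note: run_module now treats export_start_date as optional, falling back to FULL_BKFILL_START_DATE from constants when the parameter is absent. A stand-alone sketch of that fallback pattern (the date below is a placeholder; the real value lives in constants.py):

from datetime import datetime

FULL_BKFILL_START_DATE = datetime(2020, 2, 20)  # placeholder value for illustration only

def resolve_export_start_date(indicator_params: dict) -> datetime:
    # Use the configured start date if present, otherwise backfill from the earliest supported date.
    default = datetime.strftime(FULL_BKFILL_START_DATE, "%Y-%m-%d")
    return datetime.strptime(indicator_params.get("export_start_date", default), "%Y-%m-%d")

print(resolve_export_start_date({"export_start_date": "2024-07-01"}))  # explicit value wins
print(resolve_export_start_date({}))                                   # falls back to the backfill start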
1 change: 1 addition & 0 deletions google_symptoms/setup.py
@@ -14,6 +14,7 @@
     "pylint==2.8.3",
     "pytest-cov",
     "pytest",
+    "pytest-freezegun~=0.4.2"
 ]
 
 setup(
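Note: pytest-freezegun is presumably added so the date-window tests can pin "today" to a fixed value; without it, assertions that depend on datetime.today() would drift with the wall clock. A minimal usage sketch (hypothetical test, not part of this PR):

import pytest
from datetime import datetime

@pytest.mark.freeze_time("2021-01-05")
def test_today_is_frozen():
    # With the clock frozen by pytest-freezegun, date-difference math is deterministic.
    assert datetime.now().date() == datetime(2021, 1, 5).date()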
33 changes: 24 additions & 9 deletions google_symptoms/tests/conftest.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 import logging
 from pathlib import Path
+import re
 
 import copy
 import pytest
@@ -27,7 +28,7 @@
 # end as open_covid_region_code,
 # *
 # from `bigquery-public-data.covid19_symptom_search.states_daily_2020` # States by day
-# where timestamp(date) between timestamp("2020-07-26") and timestamp("2020-08-11")
+# where timestamp(date) between timestamp("2020-07-15") and timestamp("2020-08-22")
 
 # County data is created by running the following query in the BigQuery
 # browser console:
@@ -38,17 +39,16 @@
 # end as open_covid_region_code,
 # *
 # from `bigquery-public-data.covid19_symptom_search.counties_daily_2020` # Counties by day; includes state and county name, + FIPS code
-# where timestamp(date) between timestamp("2020-07-26") and timestamp("2020-08-11")
+# where timestamp(date) between timestamp("2020-07-15") and timestamp("2020-08-22")
-
+
 
 good_input = {
-    "state": f"{TEST_DIR}/test_data/small_states_daily.csv",
-    "county": f"{TEST_DIR}/test_data/small_counties_daily.csv"
+    "state": f"{TEST_DIR}/test_data/small_states_2020_07_15_2020_08_22.csv",
+    "county": f"{TEST_DIR}/test_data/small_counties_2020_07_15_2020_08_22.csv"
 }
 
 patch_input = {
     "state": f"{TEST_DIR}/test_data/state_2024-05-16_2024-07-18.csv",
     "county": f"{TEST_DIR}/test_data/county_2024-05-16_2024-07-18.csv"
-
 }
 
 symptom_names = ["symptom_" +
@@ -79,9 +79,9 @@ def params():
             "log_filename": f"{TEST_DIR}/test.log",
         },
         "indicator": {
-            "export_start_date": "2020-02-20",
             "bigquery_credentials": {},
             "num_export_days": 14,
+            "custom_run": False,
             "static_file_dir": "../static",
             "api_credentials": "fakesecret"
         },
@@ -124,7 +124,22 @@ def run_as_module(params):
 
     with mock.patch("delphi_google_symptoms.pull.initialize_credentials",
                     return_value=None), \
-        mock.patch("pandas_gbq.read_gbq", side_effect=[state_data, county_data]), \
+        mock.patch("pandas_gbq.read_gbq") as mock_read_gbq, \
+        mock.patch("delphi_google_symptoms.pull.initialize_credentials", return_value=None), \
         mock.patch("delphi_google_symptoms.date_utils.covidcast.metadata", return_value=covidcast_metadata):
-        delphi_google_symptoms.run.run_module(params)
+        def side_effect(*args, **kwargs):
+            if "symptom_search_sub_region_1_daily" in args[0]:
+                df = state_data
+                pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
+                start_date, end_date = re.findall(pattern, args[0])
+                return df[(df["date"] >= start_date) & (df["date"] <= end_date)]
+            elif "symptom_search_sub_region_2_daily" in args[0]:
+                df = county_data
+                pattern = re.compile(r'\d{4}-\d{2}-\d{2}')
+                start_date, end_date = re.findall(pattern, args[0])
+                return df[(df["date"] >= start_date) & (df["date"] <= end_date)]
+            else:
+                return pd.DataFrame()
+
+        mock_read_gbq.side_effect = side_effect
+        delphi_google_symptoms.run.run_module(params)
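Note: the new read_gbq side effect chooses the state or county fixture from the table name in the SQL it receives and then slices that fixture to the date range embedded in the query, so each mocked call returns only the rows the real query would. A stand-alone illustration of the date extraction (the query text below is invented for the example):

import re
import pandas as pd

query = (
    "select * from `bigquery-public-data.covid19_symptom_search.symptom_search_sub_region_1_daily` "
    'where timestamp(date) between timestamp("2020-07-15") and timestamp("2020-08-22")'
)
fixture = pd.DataFrame({"date": ["2020-07-01", "2020-07-20", "2020-09-01"], "val": [1, 2, 3]})

start_date, end_date = re.findall(r"\d{4}-\d{2}-\d{2}", query)
subset = fixture[(fixture["date"] >= start_date) & (fixture["date"] <= end_date)]
print(subset)  # only the 2020-07-20 row falls inside the queried window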
