diff --git a/deploy.json b/deploy.json index 59d141ba4..3396dbbf6 100644 --- a/deploy.json +++ b/deploy.json @@ -138,15 +138,6 @@ "add-header-comment": true }, - "// acquisition - norostat", - { - "type": "move", - "src": "src/acquisition/norostat/", - "dst": "[[package]]/acquisition/norostat/", - "match": "^.*\\.(py)$", - "add-header-comment": true - }, - "// acquisition - paho", { "type": "move", diff --git a/docs/api/norostat.md b/docs/api/norostat.md index 6e801116c..dded4ec13 100644 --- a/docs/api/norostat.md +++ b/docs/api/norostat.md @@ -13,6 +13,8 @@ General topics not specific to any particular endpoint are discussed in the [contributing](README.md#contributing), [citing](README.md#citing), and [data licensing](README.md#data-licensing). +**NOTE**: Delphi stopped acquiring data from this data source in November 2020. + ## NoroSTAT Data ... diff --git a/src/acquisition/norostat/norostat_add_history.py b/src/acquisition/norostat/norostat_add_history.py deleted file mode 100644 index 64fd11ff7..000000000 --- a/src/acquisition/norostat/norostat_add_history.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Parses historical versions of the NoroSTAT data-table and updates the -appropriate databases. Currently uses snapshots from the WayBack Machine -(archive.org). A more comprehensive archival service may be mementoweb.org, -which appears to pull from many services that implement the Memento protocol, -including archive.org. Manually downloaded snapshots could be recorded via this -script as well. -""" - -# standard library -import re -import os -import time -import collections - -# first party -from . import norostat_sql -from . 
import norostat_raw - - - -def main(): - norostat_sql.ensure_tables_exist() - snapshot_dir = os.path.expanduser("~/norostat_history/wayback/websites/www.cdc.gov/norovirus/reporting/norostat/data-table.html/") - snapshot_version_counter = collections.Counter() - for subdir in os.listdir(snapshot_dir): - if re.match(r'[0-9]+', subdir) is not None: - # appears to be snapshot dir - snapshot_version_counter[subdir] = 0 # register that loop found this snapshot directory - for norostat_capitalization in ["norostat","noroSTAT"]: - time.sleep(0.002) # ensure parse times are unique, assuming OS can accurately sleep and measure to ms precision - path = os.path.join(snapshot_dir,subdir,"norovirus","reporting",norostat_capitalization,"data-table.html") - if os.path.isfile(path): - print("Processing file ", path) - with open(path, 'r') as datatable_file: - content = datatable_file.read() - wide_raw = norostat_raw.parse_content_to_wide_raw(content) - long_raw = norostat_raw.melt_wide_raw_to_long_raw(wide_raw) - norostat_sql.record_long_raw(long_raw) - snapshot_version_counter[subdir] += 1 - print('Successfully uploaded the following snapshots, with the count indicating the number of data-table versions found inside each snapshot (expected to be 1, or maybe 2 if there was a change in capitalization; 0 indicates the NoroSTAT page was not found within a snapshot directory); just "Counter()" indicates no snapshot directories were found:', snapshot_version_counter) - norostat_sql.update_point() - -if __name__ == '__main__': - main() diff --git a/src/acquisition/norostat/norostat_raw.py b/src/acquisition/norostat/norostat_raw.py deleted file mode 100644 index 582de9684..000000000 --- a/src/acquisition/norostat/norostat_raw.py +++ /dev/null @@ -1,112 +0,0 @@ -""" -Functions to fetch, save, load, and format the NoroSTAT data-table. 
Formatting -functions include conversion from html content to "wide_raw" --- a wide data -frame in a tuple along with metadata --- and then to "long_raw" --- a long/tall -data frame in a tuple along with metadata. Metadata: release_date, parse_time, -and (constant) location. Here, the location will be (a str representing) a set -of states. -""" - - - -# standard library -import datetime -import re -import pickle - -# third party -import requests -import lxml.html -import pandas as pd - -# first party -from .norostat_utils import * - -def fetch_content(norostat_datatable_url="https://www.cdc.gov/norovirus/reporting/norostat/data-table.html"): - """Download NoroSTAT data-table. Returns the html content.""" - headers = { - 'User-Agent': 'delphibot/1.0 (+https://delphi.cmu.edu/)', - } - resp = requests.get(norostat_datatable_url, headers=headers) - expect_value_eq(resp.status_code, 200, - 'Wanted status code {}. Received: ') - expect_value_eq(resp.headers.get("Content-Type"), "text/html", - 'Expected Content-Type "{}"; Received ') - return resp.content - -def save_sample_content(content, f="sample_content.pickle"): - """Save the content from fetch_content into a pickle file for most testing (don't download unnecessarily).""" - with open(f, "wb") as handle: - pickle.dump(content, handle) - -def load_sample_content(f="sample_content.pickle"): - """Load data from a past call to fetch_content from a pickle file for most testing (don't download unnecessarily).""" - with open(f, "rb") as handle: - content = pickle.load(handle) - return content - -def parse_content_to_wide_raw(content): - """Convert the html content for the data-table into a wide data frame, then stick it in a tuple along with the release_date, parse_time, and (constant) location.""" - parse_time = datetime.datetime.now() - html_root = lxml.html.fromstring(content) - # Extract the release date, a.k.a. dateModified, a.k.a. 
"Page last updated" date; ~Dec 2018 this is only available in a meta tag; previously, it was available in a visible span - dateModified_meta_elts = html_root.xpath('//meta[@property="cdc:last_updated"]') - dateModified_span_elts = html_root.xpath('//span[@itemprop="dateModified"]') - if len(dateModified_meta_elts) == 1: - [dateModified_meta_elt] = dateModified_meta_elts - dateModified = dateModified_meta_elt.attrib['content'] - elif len(dateModified_span_elts) == 1: - [dateModified_span_elt] = dateModified_span_elts - dateModified = dateModified_span_elt.text - else: - raise Exception("Could not find the expected number of dateModified meta or span tags.") - # FIXME check/enforce locale - release_date = datetime.datetime.strptime(dateModified, "%B %d, %Y").date() - # Check that table description still specifies suspected&confirmed norovirus - # outbreaks (insensitive to case of certain letters and allowing for both old - # "to the" and new "through the" text), then extract list of states from the - # description: - [description_elt] = html_root.xpath('''//p[ - contains(translate(text(), "SCNORHD", "scnorhd"), "suspected and confirmed norovirus outbreaks reported by state health departments in") and - ( - contains(text(), "to the") or - contains(text(), "through the") - ) - ]''') - location = re.match(".*?[Dd]epartments in (.*?) (?:to)|(?:through) the.*$", description_elt.text).group(1) - # Attempt to find exactly 1 table (note: it would be nice to filter on the - # associated caption, but no such caption is present in earlier versions): - [table] = html_root.xpath('//table') - # Convert html table to DataFrame: - # Directly reading in the table with pd.read_html performs unwanted dtype - # inference, but reveals the column names: - [wide_raw_df_with_unwanted_conversions] = pd.read_html(lxml.html.tostring(table)) - # We want all columns to be string columns. However, there does not appear - # to be an option to disable dtype inference in pd.read_html. 
Hide all - # entries inside 1-tuple wrappers using pre-dtype-inference converters, - # then unpack afterward (the entries fed to the converters should already - # be strings, but "convert" them to strings just in case): - [wide_raw_df_with_wrappers] = pd.read_html( - lxml.html.tostring(table), - converters= {col: lambda entry: (str(entry),) - for col in wide_raw_df_with_unwanted_conversions.columns} - ) - # Unwrap entries: - wide_raw_df = wide_raw_df_with_wrappers.applymap(lambda wrapper: wrapper[0]) - # Check format: - expect_value_eq(wide_raw_df.columns[0], "Week", - 'Expected raw_colnames[0] to be "{}"; encountered ') - for colname in wide_raw_df.columns: - expect_result_eq(dtype_kind, wide_raw_df[colname].head(), "O", - 'Expected (head of) "%s" column to have dtype kind "{}"; instead had dtype kind & head '%(colname)) - # Pack up df with metadata: - wide_raw = (wide_raw_df, release_date, parse_time, location) - return wide_raw - -def melt_wide_raw_to_long_raw(wide_raw): - (wide_raw_df, release_date, parse_time, location) = wide_raw - long_raw_df = wide_raw_df \ - .melt(id_vars=["Week"], var_name="measurement_type", value_name="value") \ - .rename(index=str, columns={"Week": "week"}) - long_raw = (long_raw_df, release_date, parse_time, location) - return long_raw diff --git a/src/acquisition/norostat/norostat_sql.py b/src/acquisition/norostat/norostat_sql.py deleted file mode 100644 index 168e275eb..000000000 --- a/src/acquisition/norostat/norostat_sql.py +++ /dev/null @@ -1,434 +0,0 @@ -# standard library -import re - -# third party -import mysql.connector - -# first party -from .norostat_utils import * -import delphi.operations.secrets as secrets - -# Column names: -# `release_date` :: release date as stated in the web page in the dateModified -# span, displayed on the web page with the label "Page last updated:" -# `parse_time` :: time that we attempted to parse the data out of a downloaded -# version of the web page; when the scraper is running, this may be 
similar -# to a fetch time, but when loading in past versions that have been saved, -# it probably won't mean the same thing; this is tracked (a) in case the -# provided release date ever is out of date so that the raw data will still -# be recorded and we can recover later on, and (b) to provide a record of -# when parses/fetches happened; if there is a request for the data for a -# particular `release_date` with no restrictions on `parse_time`, the -# version with the latest `parse_time` should be selected -# (`release_date`, `parse_time`) :: uniquely identify a version of the table -# `measurement_type_id` :: "pointer" to an interned measurement_type string -# `measurement_type` :: the name of some column other than "Week" in the -# data-table -# `location_id` :: "pointer" to an interned location string -# `location` :: a string containing the list of reporting states -# `week_id` :: "pointer" to an interned week string -# `week` :: a string entry from the "Week" column -# `value` :: an string entry from some column other than "Week" in the -# data-table -# `new_value` :: an update to a `value` provided by a new version of the data -# table: either a string representing an added or revised entry (or a -# redundant repetition of a value retained from a past issue --- although -# no such entries should be generated by the code in this file), or NULL -# representing a deletion of a cell/entry from the table -# -# Tables: -# `norostat_raw_datatable_version_list` :: list of all versions of the raw -# data-table that have ever been successfully parsed -# `_pool` :: maps each encountered value of string `` to a unique ID -# `_id`, so that the string's character data is not duplicated in the -# tables on disk; serves a purpose similar to Java's interned string pool -# `norostat_raw_datatable_diffs` :: contains diffs between consecutive versions -# of the raw data-table (when arranged according to the tuple -# (`release_date`,`parse_time`) using lexicographical tuple 
ordering) -# `norostat_raw_datatable_parsed` :: a temporary table to hold the version of -# the raw data-table (in long/melted format) to be recorded; uses string -# values instead of interned string id's, so will need to be joined with -# `*_pool` tables for operations with other tables -# `norostat_raw_datatable_previous` :: a temporary table to hold an -# already-recorded version of the raw data-table with the latest -# `release_date`, `parse_time` before those of the version to be recorded; -# if there is no such version, this table will be empty (as if we recorded -# an empty version of the table before all other versions); uses interned -# string id's -# `norostat_raw_datatable_next` :: a temporary table to hold an -# already-recorded version of the raw data-table with the earliest -# `release_date`, `parse_time` after those of the version to be recorded; -# if there is no such version, this table will not be created or used; uses -# interned string id's - -def ensure_tables_exist(): - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_version_list` ( - `release_date` DATE NOT NULL, - `parse_time` DATETIME(6) NOT NULL, - PRIMARY KEY (`release_date`, `parse_time`) - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_measurement_type_pool` ( - `measurement_type_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, - `measurement_type` NVARCHAR(255) NOT NULL UNIQUE KEY - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_location_pool` ( - `location_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, - `location` NVARCHAR(255) NOT NULL UNIQUE KEY - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_week_pool` ( - `week_id` INT NOT NULL PRIMARY KEY AUTO_INCREMENT, - `week` NVARCHAR(255) NOT NULL UNIQUE KEY - ); - ''') - 
cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_raw_datatable_diffs` ( - `release_date` DATE NOT NULL, - `parse_time` DATETIME(6) NOT NULL, - `measurement_type_id` INT NOT NULL, - `location_id` INT NOT NULL, - `week_id` INT NOT NULL, - `new_value` NVARCHAR(255), -- allow NULL, with meaning "removed" - FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_raw_datatable_version_list` (`release_date`,`parse_time`), - FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), - FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), - FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), - UNIQUE KEY (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`), - PRIMARY KEY (`release_date`, `parse_time`, `measurement_type_id`, `location_id`, `week_id`) - -- (the indices here are larger than the data, but reducing the key - -- sizes and adding an id somehow seems to result in larger index sizes - -- somehow) - ); - ''') - cnx.commit() - finally: - cnx.close() - -def dangerously_drop_all_norostat_tables(): - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - # Drop tables in reverse order (to avoid foreign key related errors): - cursor.execute(''' - DROP TABLE IF EXISTS `norostat_point_diffs`, - `norostat_point_version_list`, - `norostat_raw_datatable_diffs`, - `norostat_raw_datatable_week_pool`, - `norostat_raw_datatable_location_pool`, - `norostat_raw_datatable_measurement_type_pool`, - `norostat_raw_datatable_version_list`; - ''') - cnx.commit() # (might do nothing; each DROP commits itself anyway) - finally: - cnx.close() - -def record_long_raw(long_raw): - (long_raw_df, release_date, parse_time, location) = long_raw - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') 
- try: - cursor = cnx.cursor() - cnx.start_transaction(isolation_level='SERIALIZABLE') - # Create, populate `norostat_raw_datatable_parsed`: - cursor.execute(''' - CREATE TEMPORARY TABLE `norostat_raw_datatable_parsed` ( - `measurement_type` NVARCHAR(255) NOT NULL, - `location` NVARCHAR(255) NOT NULL, - `week` NVARCHAR(255) NOT NULL, - `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) - PRIMARY KEY (`measurement_type`, `location`, `week`) - ) ENGINE=MEMORY; - ''') - cursor.executemany(''' - INSERT INTO `norostat_raw_datatable_parsed` (`week`,`measurement_type`,`value`,`location`) - VALUES (%s, %s, %s, %s); - ''', [(week, measurement_type, value, location) for - (week, measurement_type, value) in long_raw_df[["week","measurement_type","value"]].astype(str).itertuples(index=False, name=None) - ]) - # Create, populate `norostat_raw_datatable_previous`: - cursor.execute(''' - CREATE TEMPORARY TABLE `norostat_raw_datatable_previous` ( - `measurement_type_id` INT NOT NULL, - `location_id` INT NOT NULL, - `week_id` INT NOT NULL, - `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) - -- would like but not allowed: FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), - -- would like but not allowed: FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), - -- would like but not allowed: FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), - PRIMARY KEY (`measurement_type_id`, `location_id`, `week_id`) - ) ENGINE=MEMORY; - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_previous` (`measurement_type_id`, `location_id`, `week_id`, `value`) - SELECT `latest`.`measurement_type_id`, `latest`.`location_id`, `latest`.`week_id`, `latest`.`new_value` - FROM `norostat_raw_datatable_diffs` AS `latest` - -- Get the latest `new_value` by "group" (measurement_type, 
location, week) - -- using the fact that there are no later measurements belonging to the - -- same group (find NULL entries in `later`.{release_date,parse_time} - -- in the LEFT JOIN below); if the latest `new_value` is NULL, don't - -- include it in the result; it means that the corresponding cell/entry - -- has been removed from the data-table: - LEFT JOIN ( - SELECT * FROM `norostat_raw_datatable_diffs` - WHERE (`release_date`,`parse_time`) <= (%s,%s) - ) `later` - ON `latest`.`measurement_type_id` = `later`.`measurement_type_id` AND - `latest`.`location_id` = `later`.`location_id` AND - `latest`.`week_id` = `later`.`week_id` AND - (`latest`.`release_date`, `latest`.`parse_time`) < - (`later`.`release_date`, `later`.`parse_time`) - WHERE (`latest`.`release_date`, `latest`.`parse_time`) <= (%s, %s) AND - `later`.`parse_time` IS NULL AND - `latest`.`new_value` IS NOT NULL; - ''', (release_date, parse_time, release_date, parse_time)) - # Find next recorded `release_date`, `parse_time` if any; create, populate - # `norostat_raw_datatable_next` if there is such a version: - cursor.execute(''' - SELECT `release_date`, `parse_time` - FROM `norostat_raw_datatable_version_list` - WHERE (`release_date`, `parse_time`) > (%s,%s) - ORDER BY `release_date`, `parse_time` - LIMIT 1 - ''', (release_date, parse_time)) - next_version_if_any = cursor.fetchall() - expect_result_in(len, next_version_if_any, (0,1), - 'Bug: expected next-version query to return a number of results in {}; instead have len & val ') - if len(next_version_if_any) != 0: - cursor.execute(''' - CREATE TEMPORARY TABLE `norostat_raw_datatable_next` ( - `measurement_type_id` INT NOT NULL, - `location_id` INT NOT NULL, - `week_id` INT NOT NULL, - `value` NVARCHAR(255) NOT NULL, -- forbid NULL; has special external meaning (see above) - -- would like but not allowed: FOREIGN KEY (`measurement_type_id`) REFERENCES `norostat_raw_datatable_measurement_type_pool` (`measurement_type_id`), - -- would like but not 
allowed: FOREIGN KEY (`location_id`) REFERENCES `norostat_raw_datatable_location_pool` (`location_id`), - -- would like but not allowed: FOREIGN KEY (`week_id`) REFERENCES `norostat_raw_datatable_week_pool` (`week_id`), - PRIMARY KEY (`measurement_type_id`, `location_id`, `week_id`) - ) ENGINE=MEMORY; - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_next` (`measurement_type_id`, `location_id`, `week_id`, `value`) - SELECT `latest`.`measurement_type_id`, `latest`.`location_id`, `latest`.`week_id`, `latest`.`new_value` - FROM `norostat_raw_datatable_diffs` AS `latest` - -- Get the latest `new_value` by "group" (measurement_type, location, week) - -- using the fact that there are no later measurements belonging to the - -- same group (find NULL entries in `later`.{release_date,parse_time} - -- in the LEFT JOIN below); if the latest `new_value` is NULL, don't - -- include it in the result; it means that the corresponding cell/entry - -- has been removed from the data-table: - LEFT JOIN ( - SELECT * FROM `norostat_raw_datatable_diffs` - WHERE (`release_date`,`parse_time`) <= (%s, %s) - ) `later` - ON `latest`.`measurement_type_id` = `later`.`measurement_type_id` AND - `latest`.`location_id` = `later`.`location_id` AND - `latest`.`week_id` = `later`.`week_id` AND - (`latest`.`release_date`, `latest`.`parse_time`) < - (`later`.`release_date`, `later`.`parse_time`) - WHERE (`latest`.`release_date`, `latest`.`parse_time`) <= (%s, %s) AND - `later`.`parse_time` IS NULL AND - `latest`.`new_value` IS NOT NULL -- NULL means value was removed - ''', next_version_if_any[0]+next_version_if_any[0]) - # Register new version in version list: - try: - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_version_list` (`release_date`, `parse_time`) - VALUES (%s, %s) - ''', (release_date, parse_time)) - except mysql.connector.errors.IntegrityError as e: - raise Exception(['Encountered an IntegrityError when updating the norostat_raw_datatable_version_list table; 
this probably indicates that a version with the same `release_date` and `parse_time` was already added to the database; parse_time has limited resolution, so this can happen from populating the database too quickly when there are duplicate release dates; original error: ', e]) - # Add any new measurement_type, location, or week strings to the associated - # string pools: - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_measurement_type_pool` (`measurement_type`) - SELECT DISTINCT `measurement_type` - FROM `norostat_raw_datatable_parsed` - WHERE `measurement_type` NOT IN ( - SELECT `norostat_raw_datatable_measurement_type_pool`.`measurement_type` - FROM `norostat_raw_datatable_measurement_type_pool` - ); - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_location_pool` (`location`) - SELECT DISTINCT `location` - FROM `norostat_raw_datatable_parsed` - WHERE `location` NOT IN ( - SELECT `norostat_raw_datatable_location_pool`.`location` - FROM `norostat_raw_datatable_location_pool` - ); - ''') - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_week_pool` (`week`) - SELECT DISTINCT `week` - FROM `norostat_raw_datatable_parsed` - WHERE `week` NOT IN ( - SELECT `norostat_raw_datatable_week_pool`.`week` - FROM `norostat_raw_datatable_week_pool` - ); - ''') - # Record diff: [newly parsed version "minus" previous version] (first, - # record additions/updates, then record deletions): - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, `value` - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - WHERE (`measurement_type_id`, `location_id`, `week_id`, `value`) NOT IN ( - 
SELECT `norostat_raw_datatable_previous`.`measurement_type_id`, - `norostat_raw_datatable_previous`.`location_id`, - `norostat_raw_datatable_previous`.`week_id`, - `norostat_raw_datatable_previous`.`value` - FROM `norostat_raw_datatable_previous` - ); - ''', (release_date, parse_time)) - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, NULL - FROM `norostat_raw_datatable_previous` - WHERE (`measurement_type_id`, `location_id`, `week_id`) NOT IN ( - SELECT `norostat_raw_datatable_measurement_type_pool`.`measurement_type_id`, - `norostat_raw_datatable_location_pool`.`location_id`, - `norostat_raw_datatable_week_pool`.`week_id` - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - ); - ''', (release_date, parse_time)) - # If there is an already-recorded next version, its diff is invalidated by - # the insertion of the newly parsed version; delete the [next version - # "minus" previous version] diff and record the [next version "minus" newly - # parsed] diff: - if len(next_version_if_any) != 0: - cursor.execute(''' - DELETE FROM `norostat_raw_datatable_diffs` - WHERE `release_date`=%s AND `parse_time`=%s; - ''', next_version_if_any[0]) - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, `value` - FROM `norostat_raw_datatable_next` - WHERE (`measurement_type_id`, `location_id`, `week_id`, `value`) NOT IN ( - SELECT - `norostat_raw_datatable_measurement_type_pool`.`measurement_type_id`, - 
`norostat_raw_datatable_location_pool`.`location_id`, - `norostat_raw_datatable_week_pool`.`week_id`, - `norostat_raw_datatable_parsed`.`value` - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - ); - ''', next_version_if_any[0]) - cursor.execute(''' - INSERT INTO `norostat_raw_datatable_diffs` (`measurement_type_id`, `location_id`, `week_id`, `release_date`, `parse_time`, `new_value`) - SELECT `measurement_type_id`, `location_id`, `week_id`, %s, %s, NULL - FROM `norostat_raw_datatable_parsed` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type`) - LEFT JOIN `norostat_raw_datatable_location_pool` USING (`location`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week`) - WHERE (`measurement_type_id`, `location_id`, `week_id`) NOT IN ( - SELECT `norostat_raw_datatable_next`.`measurement_type_id`, - `norostat_raw_datatable_next`.`location_id`, - `norostat_raw_datatable_next`.`week_id` - FROM `norostat_raw_datatable_next` - ); - ''', next_version_if_any[0]) - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_point_version_list` ( - `release_date` DATE NOT NULL, - `parse_time` DATETIME(6) NOT NULL, - FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_raw_datatable_version_list` (`release_date`,`parse_time`), - PRIMARY KEY (`release_date`, `parse_time`) - ); - ''') - cursor.execute(''' - CREATE TABLE IF NOT EXISTS `norostat_point_diffs` ( - `release_date` DATE NOT NULL, - `parse_time` datetime(6) NOT NULL, - `location_id` INT NOT NULL, - `epiweek` INT NOT NULL, - `new_value` NVARCHAR(255), -- allow NULL, with meaning "removed" - FOREIGN KEY (`release_date`,`parse_time`) REFERENCES `norostat_point_version_list` (`release_date`,`parse_time`), - FOREIGN KEY (`location_id`) REFERENCES 
norostat_raw_datatable_location_pool (`location_id`), - UNIQUE KEY (`location_id`, `epiweek`, `release_date`, `parse_time`, `new_value`), - PRIMARY KEY (`release_date`, `parse_time`, `location_id`, `epiweek`) - ); - ''') - cnx.commit() # (might do nothing; each statement above takes effect and/or commits immediately) - finally: - cnx.close() - -def update_point(): - (u, p) = secrets.db.epi - cnx = mysql.connector.connect(user=u, password=p, database='epidata') - try: - cursor = cnx.cursor() - cnx.start_transaction(isolation_level='serializable') - cursor.execute(''' - SELECT `release_date`, `parse_time`, `measurement_type`, `location_id`, `week`, `new_value` - FROM `norostat_raw_datatable_diffs` - LEFT JOIN `norostat_raw_datatable_measurement_type_pool` USING (`measurement_type_id`) - LEFT JOIN `norostat_raw_datatable_week_pool` USING (`week_id`) - WHERE (`release_date`, `parse_time`) NOT IN ( - SELECT `norostat_point_version_list`.`release_date`, - `norostat_point_version_list`.`parse_time` - FROM `norostat_point_version_list` - ); - ''') - raw_datatable_diff_selection = cursor.fetchall() - prog = re.compile(r"[0-9]+-[0-9]+$") - point_diff_insertion = [ - (release_date, parse_time, location_id, - season_db_to_epiweek(measurement_type, week), - int(new_value_str) if new_value_str is not None else None - ) - for (release_date, parse_time, measurement_type, location_id, week, new_value_str) - in raw_datatable_diff_selection - if prog.match(measurement_type) is not None and - new_value_str != "" - ] - cursor.execute(''' - INSERT INTO `norostat_point_version_list` (`release_date`, `parse_time`) - SELECT DISTINCT `release_date`, `parse_time` - FROM `norostat_raw_datatable_version_list` - WHERE (`release_date`, `parse_time`) NOT IN ( - SELECT `norostat_point_version_list`.`release_date`, - `norostat_point_version_list`.`parse_time` - FROM `norostat_point_version_list` - ); - ''') - cursor.executemany(''' - INSERT INTO `norostat_point_diffs` (`release_date`, `parse_time`, 
`location_id`, `epiweek`, `new_value`) - VALUES (%s, %s, %s, %s, %s) - ''', point_diff_insertion) - cnx.commit() - finally: - cnx.close() - -# note there are more efficient ways to calculate diffs without forming ..._next table -# todo give indices names -# todo trim pool functionality for if data is deleted? -# todo make classes to handle pool, keyval store, and diff table query formation -# todo test mode w/ rollback -# todo record position of rows and columns in raw data-table (using additional diff tables) -# todo consider measurement index mapping to another id -# todo add fetch_time to version list -# xxx replace "import *"'s -# xxx should cursor be closed? -# xxx is cnx auto-closed on errors? -# xxx drop temporary tables? -# fixme time zone issues diff --git a/src/acquisition/norostat/norostat_update.py b/src/acquisition/norostat/norostat_update.py deleted file mode 100644 index 4b0021dd5..000000000 --- a/src/acquisition/norostat/norostat_update.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -=============== -=== Purpose === -=============== - -Fetch NoroSTAT data table from -; -process and record it in the appropriate databases. -""" - -# first party -from . import norostat_sql -from . 
import norostat_raw - - -def main(): - # Download the data: - # content = norostat_raw.load_sample_content() - content = norostat_raw.fetch_content() - # norostat_raw.save_sample_content(content) - wide_raw = norostat_raw.parse_content_to_wide_raw(content) - long_raw = norostat_raw.melt_wide_raw_to_long_raw(wide_raw) - norostat_sql.ensure_tables_exist() - norostat_sql.record_long_raw(long_raw) - norostat_sql.update_point() - -if __name__ == '__main__': - main() diff --git a/src/acquisition/norostat/norostat_utils.py b/src/acquisition/norostat/norostat_utils.py deleted file mode 100644 index a99a4dc96..000000000 --- a/src/acquisition/norostat/norostat_utils.py +++ /dev/null @@ -1,44 +0,0 @@ -# standard library -import re -import datetime - -# first party -from delphi.utils.epidate import EpiDate - -# helper funs for checking expectations, throwing exceptions on violations: -def expect_value_eq(encountered, expected, mismatch_format): - if encountered != expected: - raise Exception([mismatch_format.format(expected), encountered]) -def expect_result_eq(f, value, expected, mismatch_format): - result = f(value) - if result != expected: - raise Exception([mismatch_format.format(expected), result, value]) -def expect_value_in(encountered, expected_candidates, mismatch_format): - if encountered not in expected_candidates: - raise Exception([mismatch_format.format(expected_candidates), encountered]) -def expect_result_in(f, value, expected_candidates, mismatch_format): - result = f(value) - if result not in expected_candidates: - raise Exception([mismatch_format.format(expected_candidates), result, value]) -def expect_str_contains(encountered, regex, mismatch_format): - if re.search(regex, encountered) is None: - raise Exception([mismatch_format.format(regex), encountered]) - -# helper fun used with expect_* funs to check value of .dtype.kind: -def dtype_kind(numpy_like): - return numpy_like.dtype.kind - -# helper fun used to convert season string ("YYYY-YY" or "YYYY-YYYY") 
and -# "Week" string (strptime format "%d-%b") to the corresponding epiweek; assumes -# by default that dates >= 1-Aug correspond to weeks of the first year: -def season_db_to_epiweek(season_str, db_date_str, first_db_date_of_season_str="1-Aug"): - year_strs = season_str.split("-") - first_year = int(year_strs[0]) - second_year = first_year + 1 - # FIXME check/enforce locale - first_date_of_season = datetime.datetime.strptime(first_db_date_of_season_str+"-"+str(first_year), "%d-%b-%Y").date() - date_using_first_year = datetime.datetime.strptime(db_date_str+"-"+str(first_year), "%d-%b-%Y").date() - date_using_second_year = datetime.datetime.strptime(db_date_str+"-"+str(second_year), "%d-%b-%Y").date() - date = date_using_first_year if date_using_first_year >= first_date_of_season else date_using_second_year - epiweek = EpiDate(date.year, date.month, date.day).get_ew() - return epiweek diff --git a/src/acquisition/norostat/sample_content.pickle b/src/acquisition/norostat/sample_content.pickle deleted file mode 100644 index 1518dde0d..000000000 Binary files a/src/acquisition/norostat/sample_content.pickle and /dev/null differ