refactor hospital admission to use delphi_utils create_export_csv #2032

Open · wants to merge 25 commits into base: main
Changes from 1 commit
20 changes: 16 additions & 4 deletions claims_hosp/delphi_claims_hosp/run.py
@@ -5,22 +5,25 @@
when the module is run with `python -m delphi_claims_hosp`.
"""

import os

# standard packages
import time
import os
from datetime import datetime, timedelta
from pathlib import Path

# third party
from delphi_utils import get_structured_logger
from delphi_utils.export import create_export_csv

from .backfill import merge_backfill_file, store_backfill_file

# first party
from .config import Config
from .download_claims_ftp_files import download
from .modify_claims_drops import modify_and_write
from .get_latest_claims_name import get_latest_filename
from .modify_claims_drops import modify_and_write
from .update_indicator import ClaimsHospIndicatorUpdater
from .backfill import (store_backfill_file, merge_backfill_file)


def run_module(params):
@@ -137,11 +140,20 @@ def run_module(params):
params["indicator"]["write_se"],
signal_name
)
updater.update_indicator(
output = updater.update_indicator_to_df(
claims_file,
params["common"]["export_dir"],
logger,
)
filtered_output_df = updater.filter_output(output)
create_export_csv(
filtered_output_df,
export_dir=params["common"]["export_dir"],
start_date=startdate,
geo_res=geo,
sensor=signal_name,
)

max_dates.append(updater.output_dates[-1])
n_csv_export.append(len(updater.output_dates))
logger.info("finished updating", geo = geo)
96 changes: 90 additions & 6 deletions claims_hosp/delphi_claims_hosp/update_indicator.py
@@ -133,14 +133,76 @@ def geo_reindex(self, data):
data_frame.fillna(0, inplace=True)
return data_frame

def update_indicator(self, input_filepath, outpath, logger):
def update_indicator_to_df(self, input_filepath, logger):
"""
Generate and output indicator values.

Args:
input_filepath: path to the aggregated claims data
outpath: output path for the csv results
"""
self.shift_dates()
final_output_inds = (self.burn_in_dates >= self.startdate) & (self.burn_in_dates <= self.enddate)

# load data
base_geo = Config.HRR_COL if self.geo == Config.HRR_COL else Config.FIPS_COL
data = load_data(input_filepath, self.dropdate, base_geo)
data_frame = self.geo_reindex(data)

# handle if we need to adjust by weekday
wd_params = (
Weekday.get_params_legacy(
data_frame,
"den",
["num"],
Config.DATE_COL,
[1, 1e5],
logger,
)
if self.weekday
else None
)
output_df = pd.DataFrame()
if not self.parallel:
for geo_id, sub_data in data_frame.groupby(level=0):
sub_data.reset_index(inplace=True)
if self.weekday:
sub_data = Weekday.calc_adjustment(wd_params, sub_data, ["num"], Config.DATE_COL)
sub_data.set_index(Config.DATE_COL, inplace=True)
res = ClaimsHospIndicator.fit(sub_data, self.burnindate, geo_id)
output_df = output_df.append(pd.DataFrame(res))
else:

n_cpu = min(Config.MAX_CPU_POOL, cpu_count())
logging.debug("starting pool with %d workers", n_cpu)
with Pool(n_cpu) as pool:
pool_results = []
for geo_id, sub_data in data_frame.groupby(level=0, as_index=False):
sub_data.reset_index(inplace=True)
if self.weekday:
sub_data = Weekday.calc_adjustment(wd_params, sub_data, ["num"], Config.DATE_COL)
sub_data.set_index(Config.DATE_COL, inplace=True)
pool_results.append(
pool.apply_async(
ClaimsHospIndicator.fit,
args=(
sub_data,
self.burnindate,
geo_id,
),
)
)
pool_results = [proc.get() for proc in pool_results]
for res in pool_results:
output_df = output_df.append(pd.DataFrame(res))

return output_df

def update_indicator(self, input_filepath, logger):
"""
Generate and output indicator values.

Args:
input_filepath: path to the aggregated claims data
"""
self.shift_dates()
final_output_inds = \
@@ -215,10 +277,30 @@ def update_indicator(self, input_filepath, outpath, logger):
"geo_level": self.geo,
"include": valid_inds,
}

self.write_to_csv(output_dict, outpath)
logging.debug("wrote files to %s", outpath)

return output_dict

def filter_output(self, df):
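"""Filter the indicator frame to rows flagged for export, rename rate to val, and sanity-check values before create_export_csv."""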
filtered_df = df[df["incl"]]
filtered_df = filtered_df.reset_index()
filtered_df.rename(columns={"rate": "val"}, inplace=True)
filtered_df["timestamp"] = filtered_df["timestamp"].astype(str)
output_df = pd.DataFrame()
for geo_id, group in filtered_df.groupby("geo_id"):
assert not group.val.isnull().any()
assert not group.se.isnull().any()
assert np.all(group.se < 5), f"se suspicious, {geo_id}: {np.where(group.se >= 5)[0]}"
if np.any(group.val > 90):
for sus_val in np.where(group.val > 90):
logging.warning("value suspicious, %s: %d", geo_id, sus_val)
if self.write_se:
assert np.all(group.val > 0) and np.all(group.se > 0), "p=0, std_err=0 invalid"
else:
group["se"] = np.NaN
group.drop("incl", inplace=True, axis="columns")
dshemetov (Contributor) commented on Sep 6, 2024:

question: is this necessary here? create_export_csv will drop it anyway.

broader question: tracing the code above, i actually don't know what columns are in output_df at this step. in the previous code, we at least knew that we were dealing with

        output_dict = {
            "rates": rates,
            "se": std_errs,
            "dates": self.output_dates,
            "geo_ids": unique_geo_ids,
            "geo_level": self.geo,
            "include": valid_inds,
        }

suggestion: i suppose that depends on what res = ClaimsHospIndicator.fit(sub_data, self.burnindate, geo_id) outputs in update_indicator, but i haven't tracked that down. what do you think about adding an assert to update_indicator at the end that makes sure that output_df has the all the right columns that we expect?
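
For illustration, a minimal version of that check could sit right after filter_output, where the column names are known; the expected set below is pieced together from this thread (geo_id, timestamp, val, se, sample_size) and should be verified against what ClaimsHospIndicator.fit actually returns before relying on it.

    # hypothetical guard after filter_output; column names assumed from this thread, not verified
    expected_cols = {"geo_id", "timestamp", "val", "se", "sample_size"}
    missing = expected_cols - set(filtered_output_df.columns)
    assert not missing, f"filtered output is missing expected columns: {missing}"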

Contributor:

could you take a look at the comment above again, i updated it

aysim319 (Contributor, Author) commented on Sep 9, 2024:

That's a good point! I would have missed sample_size if it wasn't for your comment. Hopefully this should fix the issue.

> question: is this necessary here? create_export_csv will drop it anyway.

Yes, we do need the incl column, at least until preprocess_output filters down to the rows where incl is true.

dshemetov (Contributor) commented on Sep 10, 2024:

> That's a good point! I would have missed sample_size if it wasn't for your comment. Hopefully this should fix the issue.

Glad that helped!

> Yes, we do need the incl column, at least until preprocess_output filters down to the rows where incl is true.

I meant: is it even necessary to drop it on line 230, since create_export_csv will ignore it when writing the csv? But it's a minor thing, not a big deal.
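
To make that concrete, a throwaway check along these lines would confirm that leftover bookkeeping columns never reach the exported CSVs even if filter_output keeps them; the export directory and the standard geo_id/val/se/sample_size column set are assumptions taken from this discussion, not verified against delphi_utils here.

    import glob
    import os

    import pandas as pd

    export_dir = "./receiving"  # wherever create_export_csv wrote its files
    for path in glob.glob(os.path.join(export_dir, "*.csv")):
        exported = pd.read_csv(path)
        assert "incl" not in exported.columns and "direction" not in exported.columns
        assert {"geo_id", "val", "se", "sample_size"} <= set(exported.columns)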

group["direction"] = np.NaN
output_df = output_df.append(group)

return output_df
def write_to_csv(self, output_dict, output_path="./receiving"):
"""
Write values to csv.
@@ -228,6 +310,7 @@ def write_to_csv(self, output_dict, output_path="./receiving"):
output_path: outfile path to write the csv

"""

if self.write_se:
logging.info("========= WARNING: WRITING SEs TO %s =========",
self.signal_name)
@@ -268,3 +351,4 @@ def write_to_csv(self, output_dict, output_path="./receiving"):
out_n += 1

logging.debug("wrote %d rows for %d %s", out_n, len(geo_ids), geo_level)
logging.debug("wrote files to %s", output_path)
Binary file not shown.
4 changes: 2 additions & 2 deletions claims_hosp/tests/test_modify_claims_drops.py
@@ -13,7 +13,7 @@ def test_modify_and_write(self):
logger = Mock()
files, dfs_list = modify_and_write(data_path, logger, test_mode=True)
expected_colnames = ['PatCountyFIPS', 'Pat HRR Name', 'Pat HRR ID', 'PatAgeGroup']
assert len(files) == 1
assert len(dfs_list) == 1
assert len(files) == 2
assert len(dfs_list) == 2
assert files[0] == Path('./test_data/SYNEDI_AGG_INPATIENT_11062020_1451CDT.csv.gz')
assert set(expected_colnames).issubset(set(dfs_list[0].columns))
89 changes: 69 additions & 20 deletions claims_hosp/tests/test_update_indicator.py
@@ -2,6 +2,7 @@
import os
from copy import deepcopy
from os.path import join, exists
import json
from tempfile import TemporaryDirectory

# third party
@@ -13,6 +14,7 @@
# first party
from delphi_claims_hosp.config import Config, GeoConstants
from delphi_claims_hosp.update_indicator import ClaimsHospIndicatorUpdater
from delphi_utils.export import create_export_csv

CONFIG = Config()
CONSTANTS = GeoConstants()
@@ -35,6 +37,9 @@ class TestClaimsHospIndicatorUpdater:
weekday = False
write_se = False
prefix = "foo"
start_date = "02-01-2020"
end_date = "06-01-2020"
drop_date = "2020-06-12"
small_test_data = pd.DataFrame({
"num": [0, 100, 200, 300, 400, 500, 600, 100, 200, 300, 400, 500, 600],
"hrr": [1.0] * 7 + [2.0] * 6,
@@ -44,9 +49,9 @@

def test_shift_dates(self):
updater = ClaimsHospIndicatorUpdater(
"02-01-2020",
"06-01-2020",
"06-12-2020",
self.start_date,
self.end_date,
self.drop_date,
self.geo,
self.parallel,
self.weekday,
@@ -65,9 +70,9 @@

def test_geo_reindex(self):
updater = ClaimsHospIndicatorUpdater(
"02-01-2020",
"06-01-2020",
"06-12-2020",
self.start_date,
self.end_date,
self.drop_date,
self.geo,
self.parallel,
self.weekday,
@@ -83,31 +88,32 @@ def test_update_indicator(self):
for geo in ["state", "hrr", "hhs", "nation"]:
td = TemporaryDirectory()
updater = ClaimsHospIndicatorUpdater(
"02-01-2020",
"06-01-2020",
"06-12-2020",
self.start_date,
self.end_date,
self.drop_date,
geo,
self.parallel,
self.weekday,
self.write_se,
Config.signal_name
)

updater.update_indicator(
output = updater.update_indicator(
DATA_FILEPATH,
td.name,
TEST_LOGGER
)

updater.write_to_csv(output, td.name)

assert len(os.listdir(td.name)) == len(
updater.output_dates), f"failed {geo} update_indicator test"
td.cleanup()

def test_write_to_csv_results(self):
updater = ClaimsHospIndicatorUpdater(
"02-01-2020",
"06-01-2020",
"06-12-2020",
self.start_date,
self.end_date,
self.drop_date,
self.geo,
self.parallel,
self.weekday,
@@ -185,9 +191,9 @@ def test_write_to_csv_with_se_results(self):
obfuscated_name = PARAMS["indicator"]["obfuscated_prefix"]
signal_name = obfuscated_name + "_" + Config.signal_weekday_name
updater = ClaimsHospIndicatorUpdater(
"02-01-2020",
"06-01-2020",
"06-12-2020",
self.start_date,
self.end_date,
self.drop_date,
self.geo,
self.parallel,
True,
@@ -236,9 +242,9 @@

def test_write_to_csv_wrong_results(self):
updater = ClaimsHospIndicatorUpdater(
"02-01-2020",
"06-01-2020",
"06-12-2020",
self.start_date,
self.end_date,
self.drop_date,
self.geo,
self.parallel,
self.weekday,
@@ -289,3 +295,46 @@
updater.write_to_csv(res3, td.name)

td.cleanup()

def test_prefilter_results(self):
Contributor:

praise: nice test, thanks!

question: how big is the dataset we're comparing here? do you think it's representative and gets a lot of code coverage?

suggestion: this seems like another migration test we can remove once this PR is ready for merge.

suggestion: if we want to be especially careful, we could run this same kind of test but compare staging and prod output CSVs.

Contributor (author):

A1: I need to double check, but I believe I got an actual file from a one-off run, and it should be about a gig. Do you think I should add another file that's more recent?

Response to S1: that's the idea

Response to S2: that seems like a good idea; I would need to poke around how staging is and see what happens

dshemetov (Contributor) commented on Aug 29, 2024:

> A1: I need to double check, but I believe I got an actual file from a one-off run, and it should be about a gig. Do you think I should add another file that's more recent?

I'm not familiar with the source files for hospital admission, but the answer here really depends on whether the source file is one of many signals, one of many geos, etc. If this single drop contains every signal as a column and it's the source geo that we aggregate up, then that's good coverage. But if it's not, then doing a prod/staging comparison will get that coverage instead.

side-note: very important that we squash merge this PR, so the gig-sized file doesn't make it into the commit history.

> Response to S2: that seems like a good idea; I would need to poke around how staging is and see what happens

I think it would be worthwhile, so let's do that at some point. I also think that your experience with doing prod/staging comparisons will help us streamline this process in the future and make something that does branch comparisons with the press of a button.

aysim319 (Contributor, Author) commented on Sep 4, 2024:

In staging I ran the older version and saved the output in /common/text_hosptial_admission_test_export_20240903, then scp-ed it to local and compared it against the new version with the sample script below:

    def test_compare_run(self):
        expected_path = "../from_staging/test_export"
        actual_path = "../receiving"
        expected_files = sorted(glob.glob(f"{expected_path}/*.csv"))
        actual_files = sorted(glob.glob(f"{actual_path}/*.csv"))
        for expected, actual in zip(expected_files, actual_files):
            with open(expected, "rb") as expected_f, \
                 open(actual, "rb") as actual_f:
                expected_df = pd.read_csv(expected_f)
                actual_df = pd.read_csv(actual_f)
                pd.testing.assert_frame_equal(expected_df, actual_df)

passed.

Contributor:

how many export csvs are produced by the staging run? /common/text_hosptial_admission_test_export_20240903?

Contributor (author):

20076 or so. hospital-admission creates all geos starting from 2020-02-01 till 2024-08-31 (there's some lag)

td = TemporaryDirectory()
td2 = TemporaryDirectory()

updater = ClaimsHospIndicatorUpdater(
self.start_date,
self.end_date,
self.drop_date,
"state",
self.parallel,
self.weekday,
self.write_se,
Config.signal_name
)

output = updater.update_indicator(
"test_data/EDI_AGG_INPATIENT_1_06092020_1451CDT.csv.gz",
TEST_LOGGER
)

updater.write_to_csv(output, td.name)

output_df = updater.update_indicator_to_df(
"test_data/EDI_AGG_INPATIENT_1_06092020_1451CDT.csv.gz",
TEST_LOGGER
)

filtered_output_df = updater.filter_output(output_df)
create_export_csv(filtered_output_df, td2.name,
start_date=self.start_date,
end_date=self.end_date,
geo_res="state",
sensor=Config.signal_name)
expected_files = sorted(os.listdir(td.name))
actual_files = sorted(os.listdir(td2.name))
for expected, actual in zip(expected_files, actual_files):
with open(join(td.name, expected), "rb") as expected_f, \
open(join(td2.name, actual), "rb") as actual_f:
expected_df = pd.read_csv(expected_f)
actual_df = pd.read_csv(actual_f)
pd.testing.assert_frame_equal(expected_df, actual_df)
td.cleanup()