From 9513813e239de69608fe16d624441964e49c88ba Mon Sep 17 00:00:00 2001
From: Zeb Burke-Conte <zmbc@uw.edu>
Date: Thu, 19 Oct 2023 11:20:11 -0700
Subject: [PATCH] Noise shards together

---
 src/pseudopeople/interface.py | 72 ++++++++++++++++++++---------------
 src/pseudopeople/noise.py     |  2 +-
 2 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py
index f441733f..743b3725 100644
--- a/src/pseudopeople/interface.py
+++ b/src/pseudopeople/interface.py
@@ -67,42 +67,39 @@ def _generate_dataset(
             "Please provide the path to the unmodified root data directory."
         )
     validate_data_path_suffix(data_paths)
-    noised_dataset = []
-    iterator = (
-        tqdm(data_paths, desc="Noising data", leave=False)
-        if len(data_paths) > 1
-        else data_paths
-    )
+    all_data = []
+    iterator = tqdm(data_paths, desc="Loading data") if len(data_paths) > 1 else data_paths
 
-    for data_path_index, data_path in enumerate(iterator):
+    for data_path in iterator:
         logger.debug(f"Loading data from {data_path}.")
         data = _load_data_from_path(data_path, user_filters)
         if data.empty:
             continue
-        data = _reformat_dates_for_noising(data, dataset)
-        data = _coerce_dtypes(data, dataset)
-        # Use a different seed for each data file/shard, otherwise the randomness will duplicate
-        # and the Nth row in each shard will get the same noise
-        data_path_seed = f"{seed}_{data_path_index}"
-        noised_data = noise_dataset(dataset, data, configuration_tree, data_path_seed)
-        noised_data = _extract_columns(dataset.columns, noised_data)
-        noised_dataset.append(noised_data)
+        # FIXME: Right now, Categorical columns in the Rhode Island data
+        # contain a very large number of unnecessary categories. We want
+        # to get rid of these during this loop so that they are never all
+        # in memory at the same time.
+        # TODO: Remove this when we stop Categorical encoding.
+        data = _remove_unused_categories(data, dataset)
+        all_data.append(data)
 
     # Check if all shards for the dataset are empty
-    if len(noised_dataset) == 0:
+    if len(all_data) == 0:
         raise ValueError(
             "Invalid value provided for 'state' or 'year'. No data found with "
-            f"the user provided 'state' or 'year' filters at {data_path}."
+            f"the user provided 'state' or 'year' filters at {source}."
         )
-    noised_dataset = pd.concat(noised_dataset, ignore_index=True)
 
-    # Known pandas bug: pd.concat does not preserve category dtypes so we coerce
-    # again after concat (https://github.com/pandas-dev/pandas/issues/51362)
-    noised_dataset = _coerce_dtypes(noised_dataset, dataset, cleanse_int_cols=True)
+    all_data = pd.concat(all_data, ignore_index=True)
+    _reformat_dates_for_noising(all_data, dataset)
+    all_data = _coerce_dtypes(all_data, dataset)
+    all_data = noise_dataset(dataset, all_data, configuration_tree, seed)
+    all_data = _extract_columns(dataset.columns, all_data)
+    all_data = _coerce_dtypes(all_data, dataset, cleanse_int_cols=True)
 
     logger.debug("*** Finished ***")
 
-    return noised_dataset
+    return all_data
 
 
 def validate_source_compatibility(source: Path):
@@ -151,15 +148,27 @@ def _coerce_dtypes(
     return data
 
 
+def _remove_unused_categories(data: pd.DataFrame, dataset: Dataset) -> pd.DataFrame:
+    """Drop unused categories, in place, from columns not returned as Categorical."""
+    for col in data.columns:
+        if data[col].dtype.name == "category" and (
+            # NOTE: Keep unused categories in columns returned as Categorical, e.g. event_type
+            col not in dataset.columns
+            or dataset.columns[col].dtype_name != "category"
+        ):
+            data[col] = data[col].cat.remove_unused_categories()
+
+    return data
+
+
 def _load_data_from_path(data_path: Path, user_filters: List[Tuple]) -> pd.DataFrame:
     """Load data from a data file given a data_path and a year_filter."""
     data = load_standard_dataset_file(data_path, user_filters)
     return data
 
 
-def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset):
+def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset) -> None:
     """Formats date columns so they can be noised as strings."""
-    data = data.copy()
 
     for date_column in [COLUMNS.dob.name, COLUMNS.ssa_event_date.name]:
         # Format both the actual column, and the shadow version that will be used
@@ -170,19 +179,20 @@ def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset):
                 # re-parse the format string for each row
                 # https://github.com/pandas-dev/pandas/issues/44764
                 # Year is already guaranteed to be 4-digit: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-timestamp-limits
-                year_string = data[column].dt.year.astype(str)
-                month_string = _zfill_fast(data[column].dt.month.astype(str), 2)
-                day_string = _zfill_fast(data[column].dt.day.astype(str), 2)
+                data_column = data[column]
+                year_string = data_column.dt.year.astype(str)
+                month_string = _zfill_fast(data_column.dt.month.astype(str), 2)
+                day_string = _zfill_fast(data_column.dt.day.astype(str), 2)
                 if dataset.date_format == DATEFORMATS.YYYYMMDD:
-                    data[column] = year_string + month_string + day_string
+                    result = year_string + month_string + day_string
                 elif dataset.date_format == DATEFORMATS.MM_DD_YYYY:
-                    data[column] = month_string + "/" + day_string + "/" + year_string
+                    result = month_string + "/" + day_string + "/" + year_string
                 elif dataset.date_format == DATEFORMATS.MMDDYYYY:
-                    data[column] = month_string + day_string + year_string
+                    result = month_string + day_string + year_string
                 else:
                     raise ValueError(f"Invalid date format in {dataset.name}.")
 
-    return data
+                data[column] = result
 
 
 def _zfill_fast(col: pd.Series, desired_length: int) -> pd.Series:
diff --git a/src/pseudopeople/noise.py b/src/pseudopeople/noise.py
index 4a1697e3..e458b32b 100644
--- a/src/pseudopeople/noise.py
+++ b/src/pseudopeople/noise.py
@@ -56,7 +56,7 @@ def noise_dataset(
     # except for the leave_blank kind which is special-cased below
     missingness = (dataset_data == "") | (dataset_data.isna())
 
-    for noise_type in tqdm(NOISE_TYPES, desc="Applying noise", unit="type", leave=False):
+    for noise_type in tqdm(NOISE_TYPES, desc="Applying noise", unit="type"):
         if isinstance(noise_type, RowNoiseType):
             if (
                 Keys.ROW_NOISE in noise_configuration