From 9513813e239de69608fe16d624441964e49c88ba Mon Sep 17 00:00:00 2001 From: Zeb Burke-Conte Date: Thu, 19 Oct 2023 11:20:11 -0700 Subject: [PATCH] Noise shards together --- src/pseudopeople/interface.py | 72 ++++++++++++++++++++--------------- src/pseudopeople/noise.py | 2 +- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index f441733f..743b3725 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -67,42 +67,39 @@ def _generate_dataset( "Please provide the path to the unmodified root data directory." ) validate_data_path_suffix(data_paths) - noised_dataset = [] - iterator = ( - tqdm(data_paths, desc="Noising data", leave=False) - if len(data_paths) > 1 - else data_paths - ) + all_data = [] + iterator = tqdm(data_paths, desc="Loading data") if len(data_paths) > 1 else data_paths - for data_path_index, data_path in enumerate(iterator): + for data_path in iterator: logger.debug(f"Loading data from {data_path}.") data = _load_data_from_path(data_path, user_filters) if data.empty: continue - data = _reformat_dates_for_noising(data, dataset) - data = _coerce_dtypes(data, dataset) - # Use a different seed for each data file/shard, otherwise the randomness will duplicate - # and the Nth row in each shard will get the same noise - data_path_seed = f"{seed}_{data_path_index}" - noised_data = noise_dataset(dataset, data, configuration_tree, data_path_seed) - noised_data = _extract_columns(dataset.columns, noised_data) - noised_dataset.append(noised_data) + # FIXME: Right now, Categorical columns in the Rhode Island data + # contain a very large number of unnecessary categories. We want + # to get rid of these during this loop so that they are never all + # in memory at the same time. + # TODO: Remove this when we stop Categorical encoding. + data = _remove_unused_categories(data, dataset) + all_data.append(data) # Check if all shards for the dataset are empty - if len(noised_dataset) == 0: + if len(all_data) == 0: raise ValueError( "Invalid value provided for 'state' or 'year'. No data found with " - f"the user provided 'state' or 'year' filters at {data_path}." + f"the user provided 'state' or 'year' filters at {source}." ) - noised_dataset = pd.concat(noised_dataset, ignore_index=True) - # Known pandas bug: pd.concat does not preserve category dtypes so we coerce - # again after concat (https://github.com/pandas-dev/pandas/issues/51362) - noised_dataset = _coerce_dtypes(noised_dataset, dataset, cleanse_int_cols=True) + all_data = pd.concat(all_data, ignore_index=True) + _reformat_dates_for_noising(all_data, dataset) + all_data = _coerce_dtypes(all_data, dataset) + all_data = noise_dataset(dataset, all_data, configuration_tree, seed) + all_data = _extract_columns(dataset.columns, all_data) + all_data = _coerce_dtypes(all_data, dataset, cleanse_int_cols=True) logger.debug("*** Finished ***") - return noised_dataset + return all_data def validate_source_compatibility(source: Path): @@ -151,15 +148,27 @@ def _coerce_dtypes( return data +def _remove_unused_categories(data: pd.DataFrame, dataset: Dataset) -> pd.DataFrame: + for col in data.columns: + if data[col].dtype.name == "category" and ( + # NOTE: We want to avoid dropping categories that just happen not to be used + # in columns that are returned as Categorical to the user such as event_type + col not in dataset.columns + or dataset.columns[col].dtype_name != "category" + ): + data[col] = data[col].cat.remove_unused_categories() + + return data + + def _load_data_from_path(data_path: Path, user_filters: List[Tuple]) -> pd.DataFrame: """Load data from a data file given a data_path and a year_filter.""" data = load_standard_dataset_file(data_path, user_filters) return data -def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset): +def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset) -> None: """Formats date columns so they can be noised as strings.""" - data = data.copy() for date_column in [COLUMNS.dob.name, COLUMNS.ssa_event_date.name]: # Format both the actual column, and the shadow version that will be used @@ -170,19 +179,20 @@ def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset): # re-parse the format string for each row # https://github.com/pandas-dev/pandas/issues/44764 # Year is already guaranteed to be 4-digit: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-timestamp-limits - year_string = data[column].dt.year.astype(str) - month_string = _zfill_fast(data[column].dt.month.astype(str), 2) - day_string = _zfill_fast(data[column].dt.day.astype(str), 2) + data_column = data[column] + year_string = data_column.dt.year.astype(str) + month_string = _zfill_fast(data_column.dt.month.astype(str), 2) + day_string = _zfill_fast(data_column.dt.day.astype(str), 2) if dataset.date_format == DATEFORMATS.YYYYMMDD: - data[column] = year_string + month_string + day_string + result = year_string + month_string + day_string elif dataset.date_format == DATEFORMATS.MM_DD_YYYY: - data[column] = month_string + "/" + day_string + "/" + year_string + result = month_string + "/" + day_string + "/" + year_string elif dataset.date_format == DATEFORMATS.MMDDYYYY: - data[column] = month_string + day_string + year_string + result = month_string + day_string + year_string else: raise ValueError(f"Invalid date format in {dataset.name}.") - return data + data[column] = result def _zfill_fast(col: pd.Series, desired_length: int) -> pd.Series: diff --git a/src/pseudopeople/noise.py b/src/pseudopeople/noise.py index 4a1697e3..e458b32b 100644 --- a/src/pseudopeople/noise.py +++ b/src/pseudopeople/noise.py @@ -56,7 +56,7 @@ def noise_dataset( # except for the leave_blank kind which is special-cased below missingness = (dataset_data == "") | (dataset_data.isna()) - for noise_type in tqdm(NOISE_TYPES, desc="Applying noise", unit="type", leave=False): + for noise_type in tqdm(NOISE_TYPES, desc="Applying noise", unit="type"): if isinstance(noise_type, RowNoiseType): if ( Keys.ROW_NOISE in noise_configuration