Skip to content

Commit

Permalink
Noise shards together
Browse files Browse the repository at this point in the history
  • Loading branch information
zmbc committed Oct 27, 2023
1 parent e379ef0 commit 9513813
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 32 deletions.
72 changes: 41 additions & 31 deletions src/pseudopeople/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,42 +67,39 @@ def _generate_dataset(
"Please provide the path to the unmodified root data directory."
)
validate_data_path_suffix(data_paths)
noised_dataset = []
iterator = (
tqdm(data_paths, desc="Noising data", leave=False)
if len(data_paths) > 1
else data_paths
)
all_data = []
iterator = tqdm(data_paths, desc="Loading data") if len(data_paths) > 1 else data_paths

for data_path_index, data_path in enumerate(iterator):
for data_path in iterator:
logger.debug(f"Loading data from {data_path}.")
data = _load_data_from_path(data_path, user_filters)
if data.empty:
continue
data = _reformat_dates_for_noising(data, dataset)
data = _coerce_dtypes(data, dataset)
# Use a different seed for each data file/shard, otherwise the randomness will duplicate
# and the Nth row in each shard will get the same noise
data_path_seed = f"{seed}_{data_path_index}"
noised_data = noise_dataset(dataset, data, configuration_tree, data_path_seed)
noised_data = _extract_columns(dataset.columns, noised_data)
noised_dataset.append(noised_data)
# FIXME: Right now, Categorical columns in the Rhode Island data
# contain a very large number of unnecessary categories. We want
# to get rid of these during this loop so that they are never all
# in memory at the same time.
# TODO: Remove this when we stop Categorical encoding.
data = _remove_unused_categories(data, dataset)
all_data.append(data)

# Check if all shards for the dataset are empty
if len(noised_dataset) == 0:
if len(all_data) == 0:
raise ValueError(
"Invalid value provided for 'state' or 'year'. No data found with "
f"the user provided 'state' or 'year' filters at {data_path}."
f"the user provided 'state' or 'year' filters at {source}."
)
noised_dataset = pd.concat(noised_dataset, ignore_index=True)

# Known pandas bug: pd.concat does not preserve category dtypes so we coerce
# again after concat (https://github.com/pandas-dev/pandas/issues/51362)
noised_dataset = _coerce_dtypes(noised_dataset, dataset, cleanse_int_cols=True)
all_data = pd.concat(all_data, ignore_index=True)
_reformat_dates_for_noising(all_data, dataset)
all_data = _coerce_dtypes(all_data, dataset)
all_data = noise_dataset(dataset, all_data, configuration_tree, seed)
all_data = _extract_columns(dataset.columns, all_data)
all_data = _coerce_dtypes(all_data, dataset, cleanse_int_cols=True)

logger.debug("*** Finished ***")

return noised_dataset
return all_data


def validate_source_compatibility(source: Path):
Expand Down Expand Up @@ -151,15 +148,27 @@ def _coerce_dtypes(
return data


def _remove_unused_categories(data: pd.DataFrame, dataset: Dataset) -> pd.DataFrame:
for col in data.columns:
if data[col].dtype.name == "category" and (
# NOTE: We want to avoid dropping categories that just happen not to be used
# in columns that are returned as Categorical to the user such as event_type
col not in dataset.columns
or dataset.columns[col].dtype_name != "category"
):
data[col] = data[col].cat.remove_unused_categories()

return data


def _load_data_from_path(data_path: Path, user_filters: List[Tuple]) -> pd.DataFrame:
"""Load data from a data file given a data_path and a year_filter."""
data = load_standard_dataset_file(data_path, user_filters)
return data


def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset):
def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset) -> None:
"""Formats date columns so they can be noised as strings."""
data = data.copy()

for date_column in [COLUMNS.dob.name, COLUMNS.ssa_event_date.name]:
# Format both the actual column, and the shadow version that will be used
Expand All @@ -170,19 +179,20 @@ def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset):
# re-parse the format string for each row
# https://github.com/pandas-dev/pandas/issues/44764
# Year is already guaranteed to be 4-digit: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-timestamp-limits
year_string = data[column].dt.year.astype(str)
month_string = _zfill_fast(data[column].dt.month.astype(str), 2)
day_string = _zfill_fast(data[column].dt.day.astype(str), 2)
data_column = data[column]
year_string = data_column.dt.year.astype(str)
month_string = _zfill_fast(data_column.dt.month.astype(str), 2)
day_string = _zfill_fast(data_column.dt.day.astype(str), 2)
if dataset.date_format == DATEFORMATS.YYYYMMDD:
data[column] = year_string + month_string + day_string
result = year_string + month_string + day_string
elif dataset.date_format == DATEFORMATS.MM_DD_YYYY:
data[column] = month_string + "/" + day_string + "/" + year_string
result = month_string + "/" + day_string + "/" + year_string
elif dataset.date_format == DATEFORMATS.MMDDYYYY:
data[column] = month_string + day_string + year_string
result = month_string + day_string + year_string
else:
raise ValueError(f"Invalid date format in {dataset.name}.")

return data
data[column] = result


def _zfill_fast(col: pd.Series, desired_length: int) -> pd.Series:
Expand Down
2 changes: 1 addition & 1 deletion src/pseudopeople/noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def noise_dataset(
# except for the leave_blank kind which is special-cased below
missingness = (dataset_data == "") | (dataset_data.isna())

for noise_type in tqdm(NOISE_TYPES, desc="Applying noise", unit="type", leave=False):
for noise_type in tqdm(NOISE_TYPES, desc="Applying noise", unit="type"):
if isinstance(noise_type, RowNoiseType):
if (
Keys.ROW_NOISE in noise_configuration
Expand Down

0 comments on commit 9513813

Please sign in to comment.