Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Noise shards together #345

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 34 additions & 31 deletions src/pseudopeople/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,46 +68,39 @@ def _generate_dataset(
"Please provide the path to the unmodified root data directory."
)
validate_data_path_suffix(data_paths)
noised_dataset = []
iterator = (
tqdm(data_paths, desc="Noising data", leave=False)
if len(data_paths) > 1
else data_paths
)
all_data = []
iterator = tqdm(data_paths, desc="Loading data") if len(data_paths) > 1 else data_paths

for data_path_index, data_path in enumerate(iterator):
for data_path in iterator:
logger.debug(f"Loading data from {data_path}.")
data = _load_data_from_path(data_path, user_filters)
if data.empty:
continue
data = _reformat_dates_for_noising(data, dataset)
data = _coerce_dtypes(data, dataset)
# Use a different seed for each data file/shard, otherwise the randomness will duplicate
# and the Nth row in each shard will get the same noise
data_path_seed = f"{seed}_{data_path_index}"
noised_data = noise_dataset(dataset, data, configuration_tree, data_path_seed)
noised_data = _extract_columns(dataset.columns, noised_data)
noised_dataset.append(noised_data)
# FIXME: Right now, Categorical columns in the Rhode Island data
# contain a very large number of unnecessary categories. We want
# to get rid of these during this loop so that they are never all
# in memory at the same time.
# TODO: Remove this when we stop Categorical encoding.
data = _remove_unused_categories(data, dataset)
all_data.append(data)

# Check if all shards for the dataset are empty
if len(noised_dataset) == 0:
if len(all_data) == 0:
raise ValueError(
"Invalid value provided for 'state' or 'year'. No data found with "
f"the user provided 'state' or 'year' filters at {data_path}."
f"the user provided 'state' or 'year' filters at {source}."
)
noised_dataset = pd.concat(noised_dataset, ignore_index=True)

# Known pandas bug: pd.concat does not preserve category dtypes so we coerce
# again after concat (https://github.com/pandas-dev/pandas/issues/51362)
noised_dataset = _coerce_dtypes(
noised_dataset,
dataset,
cleanse_int_cols=True,
)

all_data = pd.concat(all_data, ignore_index=True)
_reformat_dates_for_noising(all_data, dataset)
all_data = _coerce_dtypes(all_data, dataset)
all_data = noise_dataset(dataset, all_data, configuration_tree, seed)
all_data = _extract_columns(dataset.columns, all_data)
all_data = _coerce_dtypes(all_data, dataset, cleanse_int_cols=True)

logger.debug("*** Finished ***")

return noised_dataset
return all_data


def validate_source_compatibility(source: Path, dataset: Dataset):
Expand Down Expand Up @@ -168,15 +161,27 @@ def _coerce_dtypes(
return data


def _remove_unused_categories(data: pd.DataFrame, dataset: Dataset) -> pd.DataFrame:
for col in data.columns:
if data[col].dtype.name == "category" and (
# NOTE: We want to avoid dropping categories that just happen not to be used
# in columns that are returned as Categorical to the user such as event_type
col not in dataset.columns
or dataset.columns[col].dtype_name != "category"
):
data[col] = data[col].cat.remove_unused_categories()

return data


def _load_data_from_path(data_path: Path, user_filters: List[Tuple]) -> pd.DataFrame:
    """Load one data shard from ``data_path``, applying ``user_filters``.

    Delegates directly to ``load_standard_dataset_file``.
    """
    return load_standard_dataset_file(data_path, user_filters)


def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset):
def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset) -> None:
"""Formats date columns so they can be noised as strings."""
data = data.copy()

for date_column in [COLUMNS.dob.name, COLUMNS.ssa_event_date.name]:
# Format both the actual column, and the shadow version that will be used
Expand Down Expand Up @@ -204,8 +209,6 @@ def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset):
data[column] = pd.Series(np.nan, dtype=str)
data.loc[~is_na, column] = result

return data


def _zfill_fast(col: pd.Series, desired_length: int) -> pd.Series:
"""Performs the same operation as col.str.zfill(desired_length), but vectorized."""
Expand Down
2 changes: 1 addition & 1 deletion src/pseudopeople/noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def noise_dataset(
# except for the leave_blank kind which is special-cased below
missingness = (dataset_data == "") | (dataset_data.isna())

for noise_type in tqdm(NOISE_TYPES, desc="Applying noise", unit="type", leave=False):
for noise_type in tqdm(NOISE_TYPES, desc="Applying noise", unit="type"):
if isinstance(noise_type, RowNoiseType):
if (
Keys.ROW_NOISE in noise_configuration
Expand Down
Loading