Skip to content

Commit

Permalink
fix: correct data preparation function to handle different size scenarios
Browse files Browse the repository at this point in the history
  • Loading branch information
SverreNystad committed Mar 26, 2024
1 parent 57efa3f commit b4dc78d
Showing 1 changed file with 29 additions and 8 deletions.
37 changes: 29 additions & 8 deletions src/features/ml_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ def prepare_data(
validation_size: float = 0.1,
test_size: float = 0.1,
loader: DataLoader = create_data_loader(),
) -> tuple:
) -> tuple[
pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame
]:
"""
Prepare the data for the machine learning model.
This includes loading the data, feature engineering, and splitting the data into
Expand Down Expand Up @@ -41,26 +43,45 @@ def prepare_data(

engineered_features = engineer_features(x)

empty_x = pd.DataFrame(columns=engineered_features.columns)
empty_y = pd.DataFrame(columns=y.columns)

# Determine the combined fraction of the dataset held out for validation and testing
holdout_size = validation_size + test_size
if holdout_size > 1:
raise ValueError(
f"The sum of validation_size and test_size should be less than 1. Got {holdout_size}."
)

# Ensure validation_size and test_size are correctly calculated as proportions of the remaining dataset
# Define a random state for consistency in splits
RANDOM_STATE = 42

# Split the data into training, validation, and test sets
if holdout_size == 0:
# All data goes into training
return engineered_features, empty_x, empty_x, y, empty_y, empty_y
elif test_size == 0:
# No test data
x_train, x_validate, y_train, y_validate = train_test_split(
engineered_features, y, test_size=validation_size, random_state=RANDOM_STATE
)
return x_train, x_validate, empty_x, y_train, y_validate, empty_y
elif validation_size == 0:
# No validation data
x_train, x_test, y_train, y_test = train_test_split(
engineered_features, y, test_size=test_size, random_state=RANDOM_STATE
)
return x_train, empty_x, x_test, y_train, empty_y, y_test

# Calculate test size as a proportion of the holdout set
temp_test_size = test_size / holdout_size

# Split the data into training and a temporary set
RANDOM_STATE: int = 42
x_train, x_temp, y_train, y_temp = train_test_split(
engineered_features,
y,
test_size=(validation_size + test_size),
random_state=RANDOM_STATE,
engineered_features, y, test_size=holdout_size, random_state=RANDOM_STATE
)

# Split the temporary set into validation and test sets
# Further split the temporary set into validation and test sets
x_validate, x_test, y_validate, y_test = train_test_split(
x_temp, y_temp, test_size=temp_test_size, random_state=RANDOM_STATE
)
Expand Down

0 comments on commit b4dc78d

Please sign in to comment.