Skip to content

Commit

Permalink
Merge pull request #231 from DashAISoftware/feat/regression
Browse files Browse the repository at this point in the history
Feat/regression
cristian-tamblay authored Jan 13, 2025
2 parents 8ec179c + 438efb9 commit ad872bb
Showing 33 changed files with 2,580 additions and 40 deletions.
18 changes: 17 additions & 1 deletion DashAI/back/container.py
Original file line number Diff line number Diff line change
@@ -19,18 +19,24 @@
PermutationFeatureImportance,
)
from DashAI.back.job import ExplainerJob, ModelJob
from DashAI.back.metrics import F1, Accuracy, Bleu, Precision, Recall
from DashAI.back.metrics import F1, MAE, RMSE, Accuracy, Bleu, Precision, Recall
from DashAI.back.models import (
SVC,
BagOfWordsTextClassificationModel,
DecisionTreeClassifier,
DistilBertTransformer,
DummyClassifier,
GradientBoostingR,
HistGradientBoostingClassifier,
KNeighborsClassifier,
LinearRegression,
LinearSVR,
LogisticRegression,
MLPRegression,
OpusMtEnESTransformer,
RandomForestClassifier,
RandomForestRegression,
RidgeRegression,
ViTTransformer,
)
from DashAI.back.optimizers import (
@@ -39,6 +45,7 @@
)
from DashAI.back.tasks import (
ImageClassificationTask,
RegressionTask,
TabularClassificationTask,
TextClassificationTask,
TranslationTask,
@@ -53,18 +60,25 @@
TextClassificationTask,
TranslationTask,
ImageClassificationTask,
RegressionTask,
# Models
SVC,
DecisionTreeClassifier,
DummyClassifier,
GradientBoostingR,
HistGradientBoostingClassifier,
KNeighborsClassifier,
LogisticRegression,
MLPRegression,
RandomForestClassifier,
RandomForestRegression,
DistilBertTransformer,
ViTTransformer,
OpusMtEnESTransformer,
BagOfWordsTextClassificationModel,
RidgeRegression,
LinearSVR,
LinearRegression,
# Dataloaders
CSVDataLoader,
JSONDataLoader,
@@ -76,6 +90,8 @@
Precision,
Recall,
Bleu,
MAE,
RMSE,
# Optimizers
OptunaOptimizer,
HyperOptOptimizer,
443 changes: 443 additions & 0 deletions DashAI/back/example_datasets/diabetes.csv

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions DashAI/back/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -4,4 +4,6 @@
from DashAI.back.metrics.classification.f1 import F1
from DashAI.back.metrics.classification.precision import Precision
from DashAI.back.metrics.classification.recall import Recall
from DashAI.back.metrics.regression.mae import MAE
from DashAI.back.metrics.regression.rmse import RMSE
from DashAI.back.metrics.translation.bleu import Bleu
31 changes: 31 additions & 0 deletions DashAI/back/metrics/regression/mae.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""DashAI MAE regression metric implementation."""

import numpy as np
from sklearn.metrics import mean_absolute_error

from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset
from DashAI.back.metrics.regression_metric import RegressionMetric, prepare_to_metric


class MAE(RegressionMetric):
    """Mean Absolute Error (MAE) metric for regression tasks."""

    @staticmethod
    def score(true_values: DashAIDataset, predicted_values: np.ndarray) -> float:
        """Compute the mean absolute error of a set of predictions.

        Parameters
        ----------
        true_values : DashAIDataset
            A DashAI dataset holding the ground-truth target values.
        predicted_values : np.ndarray
            A one-dimensional array with the value predicted for each
            instance.

        Returns
        -------
        float
            MAE between the true values and the predicted values.
        """
        # prepare_to_metric extracts the target column and checks that both
        # inputs have the same length before the metric is computed.
        targets, predictions = prepare_to_metric(true_values, predicted_values)
        mae = mean_absolute_error(targets, predictions)
        return mae
31 changes: 31 additions & 0 deletions DashAI/back/metrics/regression/rmse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""DashAI RMSE regression metric implementation."""

import numpy as np
from sklearn.metrics import mean_squared_error

from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset
from DashAI.back.metrics.regression_metric import RegressionMetric, prepare_to_metric


class RMSE(RegressionMetric):
    """Root Mean Squared Error metric for regression tasks."""

    @staticmethod
    def score(true_values: DashAIDataset, predicted_values: np.ndarray) -> float:
        """Calculate the RMSE between true values and predicted values.

        Parameters
        ----------
        true_values : DashAIDataset
            A DashAI dataset with true values.
        predicted_values : np.ndarray
            A one-dimensional array with the predicted values
            for each instance.

        Returns
        -------
        float
            RMSE score between true values and predicted values
        """
        true_values, pred_values = prepare_to_metric(true_values, predicted_values)
        # The ``squared`` keyword of ``mean_squared_error`` was deprecated in
        # scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly.
        # Taking the square root per output *before* averaging reproduces what
        # ``squared=False`` returned, and works on old and new releases alike.
        per_output_mse = mean_squared_error(
            true_values, pred_values, multioutput="raw_values"
        )
        return float(np.mean(np.sqrt(per_output_mse)))
53 changes: 53 additions & 0 deletions DashAI/back/metrics/regression_metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from typing import Tuple

import numpy as np

from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset
from DashAI.back.metrics.base_metric import BaseMetric


class RegressionMetric(BaseMetric):
    """Base class shared by every metric that evaluates a regression model."""

    # Names of the components (tasks) this family of metrics can be used with.
    COMPATIBLE_COMPONENTS = ["RegressionTask"]


def validate_inputs(true_values: np.ndarray, pred_values: np.ndarray) -> None:
    """Check that the true and predicted values have the same length.

    Parameters
    ----------
    true_values : ndarray
        True values.
    pred_values : ndarray
        Values predicted by the model.

    Raises
    ------
    ValueError
        If both inputs do not contain the same number of elements.
    """
    n_true = len(true_values)
    n_pred = len(pred_values)

    # Guard clause: nothing to report when the sizes agree.
    if n_true == n_pred:
        return

    raise ValueError(
        "The length of the true and the predicted values must be equal, "
        f"given: len(true_values) = {n_true} and "
        f"len(pred_values) = {n_pred}."
    )


def prepare_to_metric(
    y: DashAIDataset, predicted_values: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    """Convert the true values to numpy and pair them with the predictions.

    Parameters
    ----------
    y : DashAIDataset
        A DashAIDataset with the output columns of the data. Only its first
        column is read.
    predicted_values : np.ndarray
        A one-dimensional array with the predicted values for each instance.

    Returns
    -------
    Tuple[np.ndarray, np.ndarray]
        The true values as a numpy array and the predicted values, unchanged.

    Raises
    ------
    ValueError
        Raised by ``validate_inputs`` when both arrays differ in length.
    """
    # The first output column is taken as the regression target.
    targets = np.array(y[y.column_names[0]])
    validate_inputs(targets, predicted_values)
    return targets, predicted_values
14 changes: 14 additions & 0 deletions DashAI/back/models/__init__.py
Original file line number Diff line number Diff line change
@@ -12,13 +12,27 @@
DecisionTreeClassifier,
)
from DashAI.back.models.scikit_learn.dummy_classifier import DummyClassifier
from DashAI.back.models.scikit_learn.gradient_boosting_regression import (
GradientBoostingR,
)
from DashAI.back.models.scikit_learn.hist_gradient_boosting_classifier import (
HistGradientBoostingClassifier,
)
from DashAI.back.models.scikit_learn.k_neighbors_classifier import KNeighborsClassifier
from DashAI.back.models.scikit_learn.linear_regression import LinearRegression
from DashAI.back.models.scikit_learn.linearSVR import LinearSVR
from DashAI.back.models.scikit_learn.logistic_regression import LogisticRegression
from DashAI.back.models.scikit_learn.random_forest_classifier import (
RandomForestClassifier,
)
from DashAI.back.models.scikit_learn.random_forest_regression import (
RandomForestRegression,
)
from DashAI.back.models.scikit_learn.mlp_regression import MLPRegression
from DashAI.back.models.scikit_learn.ridge_regression import RidgeRegression
from DashAI.back.models.scikit_learn.sklearn_like_classifier import (
SklearnLikeClassifier,
)
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel
from DashAI.back.models.scikit_learn.sklearn_like_regressor import SklearnLikeRegressor
from DashAI.back.models.scikit_learn.svc import SVC
239 changes: 239 additions & 0 deletions DashAI/back/models/parameters/models_schemas/GradientBoostingR.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
{
"additionalProperties": false,
"error_msg": "The parameters for Gradient Boosting regression must be one or more of ['loss', 'learning_rate', 'n_estimators', 'subsample', 'criterion', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_depth', 'min_impurity_decrease', 'init', 'random_state', 'max_features', 'alpha', 'verbose', 'max_leaf_nodes', 'warm_start', 'validation_fraction', 'n_iter_no_change', 'tol', 'ccp_alpha'].",
"description": "Gradient Boosting regression builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions.",
"properties": {
"loss": {
"oneOf": [
{
"error_msg": "The 'loss' parameter must be one of 'squared_error', 'absolute_error', 'huber', or 'quantile'.",
"description": "The 'loss' parameter specifies the loss function to be optimized.",
"type": "string",
"default": "squared_error",
"enum": ["squared_error", "absolute_error", "huber", "quantile"]
}
]
},
"learning_rate": {
"oneOf": [
{
"error_msg": "The 'learning_rate' parameter must be a positive number.",
"description": "The 'learning_rate' parameter specifies the learning rate, which shrinks the contribution of each tree.",
"type": "number",
"minimum": 0,
"default": 0.1
}
]
},
"n_estimators": {
"oneOf": [
{
"error_msg": "The 'n_estimators' parameter must be a positive integer.",
"description": "The 'n_estimators' parameter specifies the number of boosting stages to be run.",
"type": "integer",
"minimum": 1,
"default": 100
}
]
},
"subsample": {
"oneOf": [
{
"error_msg": "The 'subsample' parameter must be a number between 0 and 1.",
"description": "The 'subsample' parameter specifies the fraction of samples to be used for fitting the individual base learners.",
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 1.0
}
]
},
"criterion": {
"oneOf": [
{
"error_msg": "The 'criterion' parameter must be one of 'friedman_mse' or 'squared_error'.",
"description": "The 'criterion' parameter specifies the function to measure the quality of a split.",
"type": "string",
"default": "friedman_mse",
"enum": ["friedman_mse", "squared_error"]
}
]
},
"min_samples_split": {
"oneOf": [
{
"error_msg": "The 'min_samples_split' parameter must be a number greater than or equal to 2.",
"description": "The 'min_samples_split' parameter specifies the minimum number of samples required to split an internal node.",
"type": "number",
"minimum": 2,
"default": 2
}
]
},
"min_samples_leaf": {
"oneOf": [
{
"error_msg": "The 'min_samples_leaf' parameter must be a positive integer.",
"description": "The 'min_samples_leaf' parameter specifies the minimum number of samples required to be at a leaf node.",
"type": "number",
"minimum": 1,
"default": 1
}
]
},
"min_weight_fraction_leaf": {
"oneOf": [
{
"error_msg": "The 'min_weight_fraction_leaf' parameter must be a number between 0 and 0.5.",
"description": "The 'min_weight_fraction_leaf' parameter specifies the minimum weighted fraction of the sum total of weights required to be at a leaf node.",
"type": "number",
"minimum": 0,
"maximum": 0.5,
"default": 0.0
}
]
},
"max_depth": {
"oneOf": [
{
"error_msg": "The 'max_depth' parameter must be an integer greater than or equal to 1, or null.",
"description": "The 'max_depth' parameter specifies the maximum depth of the individual regression estimators.",
"type": ["integer", "null"],
"minimum": 1,
"default": 3
}
]
},
"min_impurity_decrease": {
"oneOf": [
{
"error_msg": "The 'min_impurity_decrease' parameter must be a non-negative number.",
"description": "The 'min_impurity_decrease' parameter specifies a node will be split if this split induces a decrease of the impurity greater than or equal to this value.",
"type": "number",
"minimum": 0,
"default": 0.0
}
]
},
"init": {
"oneOf": [
{
"error_msg": "The 'init' parameter must be a string, estimator object implementing 'fit', 'partial_fit', 'predict', or None.",
"description": "The 'init' parameter specifies the estimator object to use for the initial predictions.",
"type": ["string", "null"],
"default": null,
"enum": ["fit", "partial_fit", "predict"]
}
]
},
"random_state": {
"oneOf": [
{
"error_msg": "The 'random_state' parameter must be an integer, a RandomState instance, or None.",
"description": "The 'random_state' parameter controls the random number generator.",
"type": ["integer", "null"],
"default": null
}
]
},
"max_features": {
"oneOf": [
{
"error_msg": "The 'max_features' parameter must be a number or null.",
"description": "The 'max_features' parameter specifies the number of features to consider when looking for the best split.",
"type": ["number", "null"],
"default": null
}
]
},
"alpha": {
"oneOf": [
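{
"error_msg": "The 'alpha' parameter must be a number between 0 and 1.",
"description": "The 'alpha' parameter specifies the alpha-quantile of the huber loss function and the quantile loss function. It is only used when 'loss' is 'huber' or 'quantile'.",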
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 0.9
}
]
},
"verbose": {
"oneOf": [
{
"error_msg": "The 'verbose' parameter must be an integer.",
"description": "The 'verbose' parameter specifies the verbosity level.",
"type": "integer",
"default": 0
}
]
},
"max_leaf_nodes": {
"oneOf": [
{
"error_msg": "The 'max_leaf_nodes' parameter must be an integer greater than 1, or null.",
"description": "The 'max_leaf_nodes' parameter specifies the maximum number of leaf nodes.",
"type": ["integer", "null"],
"minimum": 2,
"default": null
}
]
},
"warm_start": {
"oneOf": [
{
"error_msg": "The 'warm_start' parameter must be of type boolean.",
"description": "The 'warm_start' parameter specifies whether to reuse the solution of the previous call to fit and add more estimators to the ensemble.",
"type": "boolean",
"default": false
}
]
},
"validation_fraction": {
"oneOf": [
{
"error_msg": "The 'validation_fraction' parameter must be a number between 0 and 1.",
"description": "The 'validation_fraction' parameter specifies the proportion of training data to set aside as validation set for early stopping.",
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 0.1
}
]
},
"n_iter_no_change": {
"oneOf": [
{
"error_msg": "The 'n_iter_no_change' parameter must be a positive integer, or null.",
"description": "The 'n_iter_no_change' parameter specifies the number of iterations with no improvement to wait before early stopping.",
"type": ["integer", "null"],
"minimum": 1,
"default": null
}
]
},
"tol": {
"oneOf": [
{
"error_msg": "The 'tol' parameter must be a positive number.",
"description": "The 'tol' parameter specifies the tolerance for the early stopping.",
"type": "number",
"exclusiveMinimum": 0,
"default": 0.0001
}
]
},
"ccp_alpha": {
"oneOf": [
{
"error_msg": "The 'ccp_alpha' parameter must be a non-negative number.",
"description": "The 'ccp_alpha' parameter specifies the complexity parameter used for Minimal Cost-Complexity Pruning.",
"type": "number",
"minimum": 0,
"default": 0.0
}
]
}
},
"type": "object"
}
49 changes: 49 additions & 0 deletions DashAI/back/models/parameters/models_schemas/LinearRegression.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"additionalProperties": false,
"error_msg": "The parameters for Linear Regression must be one or more of ['fit_intercept', 'copy_X', 'n_jobs', 'positive'].",
"description": "Linear Regression is a linear approach for modeling the relationship between a dependent variable and one or more independent variables.",
"properties": {
"fit_intercept": {
"oneOf": [
{
"error_msg": "The 'fit_intercept' parameter must be of type boolean.",
"description": "The 'fit_intercept' parameter determines whether to calculate the intercept for this model. It must be of type boolean.",
"type": "boolean",
"default": true
}
]
},
"copy_X": {
"oneOf": [
{
"error_msg": "The 'copy_X' parameter must be of type boolean.",
"description": "The 'copy_X' parameter determines whether to copy the input variables. It must be of type boolean.",
"type": "boolean",
"default": true
}
]
},
"n_jobs": {
"oneOf": [
{
"error_msg": "The 'n_jobs' parameter must be an integer or null.",
"description": "The 'n_jobs' parameter specifies the number of jobs to use for computation. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors.",
"type": ["integer", "null"],
"default": null,
"minimum": -1
}
]
},
"positive": {
"oneOf": [
{
"error_msg": "The 'positive' parameter must be of type boolean.",
"description": "The 'positive' parameter determines when set to True, forces the coefficients to be positive. It must be of type boolean.",
"type": "boolean",
"default": false
}
]
}
},
"type": "object"
}
115 changes: 115 additions & 0 deletions DashAI/back/models/parameters/models_schemas/LinearSVR.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
{
"additionalProperties": false,
"error_msg": "The parameters for LinearSVR must be one or more of ['epsilon', 'tol', 'C', 'loss', 'fit_intercept', 'intercept_scaling', 'dual', 'verbose', 'random_state', 'max_iter'].",
"description": "Linear Support Vector Regression (LinearSVR) is a linear model that applies Support Vector Machine regression using a linear kernel.",
"properties": {
"epsilon": {
"oneOf": [
{
"error_msg": "The 'epsilon' parameter must be a non-negative number.",
"description": "The 'epsilon' parameter specifies the epsilon-tube within which no penalty is associated in the training loss function with points predicted within a distance epsilon from the actual value.",
"type": "number",
"minimum": 0,
"default": 0.0
}
]
},
"tol": {
"oneOf": [
{
"error_msg": "The 'tol' parameter must be a positive number.",
"description": "The 'tol' parameter specifies the tolerance for stopping criterion.",
"type": "number",
"exclusiveMinimum": 0,
"default": 0.0001
}
]
},
"C": {
"oneOf": [
{
"error_msg": "The 'C' parameter must be a positive number.",
"description": "The 'C' parameter specifies the regularization strength. It must be a positive number.",
"type": "number",
"exclusiveMinimum": 0,
"default": 1.0
}
]
},
"loss": {
"oneOf": [
{
"error_msg": "The 'loss' parameter must be one of 'epsilon_insensitive', 'squared_epsilon_insensitive'.",
"description": "The 'loss' parameter specifies the loss function. It must be one of 'epsilon_insensitive' or 'squared_epsilon_insensitive'.",
"type": "string",
"default": "epsilon_insensitive",
"enum": ["epsilon_insensitive", "squared_epsilon_insensitive"]
}
]
},
"fit_intercept": {
"oneOf": [
{
"error_msg": "The 'fit_intercept' parameter must be of type boolean.",
"description": "The 'fit_intercept' parameter specifies whether to calculate the intercept for this model.",
"type": "boolean",
"default": true
}
]
},
"intercept_scaling": {
"oneOf": [
{
"error_msg": "The 'intercept_scaling' parameter must be a positive number.",
"description": "The 'intercept_scaling' parameter is the constant value of the synthetic feature appended to each instance when 'fit_intercept' is true; increasing it lessens the effect of regularization on the intercept.",
"type": "number",
"exclusiveMinimum": 0,
"default": 1.0
}
]
},
"dual": {
"oneOf": [
{
"error_msg": "The 'dual' parameter must be of type boolean.",
"description": "The 'dual' parameter selects the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features.",
"type": "boolean",
"default": true
}
]
},
"verbose": {
"oneOf": [
{
"error_msg": "The 'verbose' parameter must be of type boolean.",
"description": "The 'verbose' parameter enables verbose output.",
"type": "boolean",
"default": false
}
]
},
"random_state": {
"oneOf": [
{
"error_msg": "The 'random_state' parameter must be an integer greater than or equal to 0, or null.",
"description": "The 'random_state' parameter determines the seed used by the random number generator.",
"type": ["integer", "null"],
"default": null,
"minimum": 0
}
]
},
"max_iter": {
"oneOf": [
{
"error_msg": "The 'max_iter' parameter must be a positive integer.",
"description": "The 'max_iter' parameter specifies the maximum number of iterations to run.",
"type": "integer",
"default": 1000,
"minimum": 1
}
]
}
},
"type": "object"
}
260 changes: 260 additions & 0 deletions DashAI/back/models/parameters/models_schemas/MLPRegression.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
{
"additionalProperties": false,
"error_msg": "The parameters for MLP regression must be one or more of ['hidden_layer_sizes', 'activation', 'solver', 'alpha', 'batch_size', 'learning_rate', 'learning_rate_init', 'power_t', 'max_iter', 'shuffle', 'random_state', 'tol', 'verbose', 'warm_start', 'momentum', 'nesterovs_momentum', 'early_stopping', 'validation_fraction', 'beta_1', 'beta_2', 'epsilon', 'n_iter_no_change', 'max_fun'].",
"description": "MLP regression is a model that uses multi-layer perceptron to predict continuous values.",
"properties": {
"activation": {
"oneOf": [
{
"error_msg": "The 'activation' parameter must be one of 'identity', 'logistic', 'tanh', or 'relu'.",
"description": "The 'activation' parameter specifies the activation function for the hidden layer.",
"type": "string",
"default": "relu",
"enum": ["identity", "logistic", "tanh", "relu"]
}
]
},
"solver": {
"oneOf": [
{
"error_msg": "The 'solver' parameter must be one of 'lbfgs', 'sgd', or 'adam'.",
"description": "The 'solver' parameter specifies the solver for weight optimization.",
"type": "string",
"default": "adam",
"enum": ["lbfgs", "sgd", "adam"]
}
]
},
"alpha": {
"oneOf": [
{
"error_msg": "The 'alpha' parameter must be a positive number.",
"description": "The 'alpha' parameter specifies the L2 penalty (regularization term) parameter.",
"type": "number",
"exclusiveMinimum": 0,
"default": 0.0001
}
]
},
"batch_size": {
"oneOf": [
{
"error_msg": "The 'batch_size' parameter must be an integer.",
"description": "The 'batch_size' parameter specifies the size of minibatches for stochastic optimizers.",
"type": "integer",
"default": null
},
{
"error_msg": "The 'batch_size' parameter must be 'auto'.",
"description": "The 'batch_size' parameter specifies the size of minibatches for stochastic optimizers.",
"type": "string",
"enum": ["auto"],
"default": "auto"
}
]
},
"learning_rate": {
"oneOf": [
{
"error_msg": "The 'learning_rate' parameter must be one of 'constant', 'invscaling', or 'adaptive'.",
"description": "The 'learning_rate' parameter specifies the learning rate schedule for weight updates.",
"type": "string",
"default": "constant",
"enum": ["constant", "invscaling", "adaptive"]
}
]
},
"learning_rate_init": {
"oneOf": [
{
"error_msg": "The 'learning_rate_init' parameter must be a positive number.",
"description": "The 'learning_rate_init' parameter specifies the initial learning rate used.",
"type": "number",
"exclusiveMinimum": 0,
"default": 0.001
}
]
},
"power_t": {
"oneOf": [
{
"error_msg": "The 'power_t' parameter must be a positive number.",
"description": "The 'power_t' parameter specifies the exponent for inverse scaling learning rate.",
"type": "number",
"exclusiveMinimum": 0,
"default": 0.5
}
]
},
"max_iter": {
"oneOf": [
{
"error_msg": "The 'max_iter' parameter must be a positive integer.",
"description": "The 'max_iter' parameter specifies the maximum number of iterations.",
"type": "integer",
"minimum": 1,
"default": 200
}
]
},
"shuffle": {
"oneOf": [
{
"error_msg": "The 'shuffle' parameter must be of type boolean.",
"description": "The 'shuffle' parameter specifies whether to shuffle samples in each iteration.",
"type": "boolean",
"default": true
}
]
},
"random_state": {
"oneOf": [
{
"error_msg": "The 'random_state' parameter must be an integer.",
"description": "The 'random_state' parameter controls the random number generator.",
"type": "integer",
"default": null
},
{
"error_msg": "The 'random_state' parameter must be null.",
"description": "The 'random_state' parameter controls the random number generator.",
"type": "null",
"default": null
}
]
},
"tol": {
"oneOf": [
{
"error_msg": "The 'tol' parameter must be a positive number.",
"description": "The 'tol' parameter specifies the tolerance for the optimization.",
"type": "number",
"exclusiveMinimum": 0,
"default": 0.0001
}
]
},
"verbose": {
"oneOf": [
{
"error_msg": "The 'verbose' parameter must be of type boolean.",
"description": "The 'verbose' parameter specifies whether to print progress messages to stdout.",
"type": "boolean",
"default": false
}
]
},
"warm_start": {
"oneOf": [
{
"error_msg": "The 'warm_start' parameter must be of type boolean.",
"description": "The 'warm_start' parameter specifies whether to reuse the solution of the previous call to fit as initialization instead of erasing the previous solution.",
"type": "boolean",
"default": false
}
]
},
"momentum": {
"oneOf": [
{
"error_msg": "The 'momentum' parameter must be a number between 0 and 1.",
"description": "The 'momentum' parameter specifies the momentum for gradient descent update.",
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 0.9
}
]
},
"nesterovs_momentum": {
"oneOf": [
{
"error_msg": "The 'nesterovs_momentum' parameter must be of type boolean.",
"description": "The 'nesterovs_momentum' parameter specifies whether to use Nesterov's momentum.",
"type": "boolean",
"default": true
}
]
},
"early_stopping": {
"oneOf": [
{
"error_msg": "The 'early_stopping' parameter must be of type boolean.",
"description": "The 'early_stopping' parameter specifies whether to use early stopping to terminate training when validation score is not improving.",
"type": "boolean",
"default": false
}
]
},
"validation_fraction": {
"oneOf": [
{
"error_msg": "The 'validation_fraction' parameter must be a number between 0 and 1.",
"description": "The 'validation_fraction' parameter specifies the proportion of training data to set aside as validation set for early stopping.",
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 0.1
}
]
},
"beta_1": {
"oneOf": [
{
"error_msg": "The 'beta_1' parameter must be a number between 0 and 1.",
"description": "The 'beta_1' parameter specifies the exponential decay rate for estimates of first moment vector in Adam, should be in [0, 1).",
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 0.9
}
]
},
"beta_2": {
"oneOf": [
{
"error_msg": "The 'beta_2' parameter must be a number between 0 and 1.",
"description": "The 'beta_2' parameter specifies the exponential decay rate for estimates of second moment vector in Adam, should be in [0, 1).",
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 0.999
}
]
},
"epsilon": {
"oneOf": [
{
"error_msg": "The 'epsilon' parameter must be a positive number.",
"description": "The 'epsilon' parameter specifies the value for numerical stability in Adam.",
"type": "number",
"exclusiveMinimum": 0,
"default": 1e-8
}
]
},
"n_iter_no_change": {
"oneOf": [
{
"error_msg": "The 'n_iter_no_change' parameter must be a positive integer.",
"description": "The 'n_iter_no_change' parameter specifies the number of iterations with no improvement to wait before stopping.",
"type": "integer",
"minimum": 1,
"default": 10
}
]
},
"max_fun": {
"oneOf": [
{
"error_msg": "The 'max_fun' parameter must be a positive integer.",
"description": "The 'max_fun' parameter specifies the maximum number of function evaluations.",
"type": "integer",
"minimum": 1,
"default": 15000
}
]
}
},
"type": "object"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
{
"additionalProperties": false,
"error_msg": "The parameters for Random Forest regression must be one or more of ['n_estimators', 'criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'bootstrap', 'oob_score', 'n_jobs', 'random_state', 'verbose', 'warm_start', 'ccp_alpha', 'max_samples', 'monotonic_cst'].",
"description": "Random Forest regression is an ensemble learning method that fits multiple decision trees and averages their predictions.",
"properties": {
"n_estimators": {
"oneOf": [
{
"error_msg": "The 'n_estimators' parameter must be a positive integer greater than or equal to 1.",
"description": "The 'n_estimators' parameter specifies the number of trees in the forest. It must be a positive integer greater than or equal to 1.",
"type": "integer",
"default": 100,
"minimum": 1
}
]
},
"criterion": {
"oneOf": [
{
"error_msg": "The 'criterion' parameter must be one of 'squared_error', 'absolute_error', 'friedman_mse' or 'poisson'.",
"description": "The 'criterion' parameter specifies the function to measure the quality of a split.",
"type": "string",
"default": "squared_error",
"enum": ["squared_error", "absolute_error", "friedman_mse", "poisson"]
}
]
},
"max_depth": {
"oneOf": [
{
"error_msg": "The 'max_depth' parameter must be an integer greater than or equal to 1, or null.",
"description": "The 'max_depth' parameter corresponds to the maximum depth of the tree. It must be an integer greater than or equal to 1, or null.",
"type": ["integer", "null"],
"default": null,
"minimum": 1
}
]
},
"min_samples_split": {
"oneOf": [
{
"error_msg": "The 'min_samples_split' parameter must be a number greater than or equal to 2.",
"description": "The 'min_samples_split' parameter is the minimum number of samples required to split an internal node. It must be a number greater than or equal to 2.",
"type": "integer",
"default": 2,
"minimum": 2
}
]
},
"min_samples_leaf": {
"oneOf": [
{
"error_msg": "The 'min_samples_leaf' parameter must be a number greater than or equal to 1.",
"description": "The 'min_samples_leaf' parameter is the minimum number of samples required to be at a leaf node. It must be a number greater than or equal to 1.",
"type": "integer",
"default": 1,
"minimum": 1
}
]
},
"min_weight_fraction_leaf": {
"oneOf": [
{
"error_msg": "The 'min_weight_fraction_leaf' parameter must be a number between 0 and 0.5.",
"description": "The 'min_weight_fraction_leaf' parameter specifies the minimum weighted fraction of the sum total of weights required to be at a leaf node. It must be a number between 0 and 0.5.",
"type": "number",
"minimum": 0,
"maximum": 0.5,
"default": 0.0
}
]
},
"max_features": {
"oneOf": [
{
"error_msg": "The 'max_features' parameter must be an integer or a float.",
"description": "The 'max_features' parameter specifies the number of features to consider when looking for the best split.",
"type": "integer",
"default": 1.0
}
]
},
"max_leaf_nodes": {
"oneOf": [
{
"error_msg": "The 'max_leaf_nodes' parameter must be an integer greater than or equal to 2, or null.",
"description": "The 'max_leaf_nodes' parameter specifies the maximum number of leaf nodes. It must be an integer greater than or equal to 2, or null.",
"type": ["integer", "null"],
"default": null,
"minimum": 2
}
]
},
"min_impurity_decrease": {
"oneOf": [
{
"error_msg": "The 'min_impurity_decrease' parameter must be a non-negative number.",
"description": "The 'min_impurity_decrease' parameter specifies a node will be split if this split induces a decrease of the impurity greater than or equal to this value. It must be a non-negative number.",
"type": "number",
"minimum": 0,
"default": 0.0
}
]
},
"bootstrap": {
"oneOf": [
{
"error_msg": "The 'bootstrap' parameter must be of type boolean.",
"description": "The 'bootstrap' parameter specifies whether bootstrap samples are used when building trees.",
"type": "boolean",
"default": true
}
]
},
"oob_score": {
"oneOf": [
{
"error_msg": "The 'oob_score' parameter must be of type boolean.",
"description": "The 'oob_score' parameter specifies whether to use out-of-bag samples to estimate the generalization score.",
"type": "boolean",
"default": false
}
]
},
"n_jobs": {
"oneOf": [
{
"error_msg": "The 'n_jobs' parameter must be an integer or null.",
"description": "The 'n_jobs' parameter specifies the number of jobs to run in parallel. None means 1, -1 means using all processors.",
"type": ["integer", "null"],
"default": null
}
]
},
"random_state": {
"oneOf": [
{
"error_msg": "The 'random_state' parameter must be an integer greater than or equal to 0, a RandomState instance, or null.",
"description": "The 'random_state' parameter controls the random number generator. It must be an integer greater than or equal to 0, a RandomState instance, or null.",
"type": ["integer", "null"],
"default": null,
"minimum": 0
}
]
},
"verbose": {
"oneOf": [
{
"error_msg": "The 'verbose' parameter must be an integer.",
"description": "The 'verbose' parameter specifies the verbosity level. It must be an integer.",
"type": "integer",
"default": 0
}
]
},
"warm_start": {
"oneOf": [
{
"error_msg": "The 'warm_start' parameter must be of type boolean.",
"description": "The 'warm_start' parameter specifies whether to reuse the solution of the previous call to fit and add more estimators to the ensemble.",
"type": "boolean",
"default": false
}
]
},
"ccp_alpha": {
"oneOf": [
{
"error_msg": "The 'ccp_alpha' parameter must be a non-negative number.",
"description": "The 'ccp_alpha' parameter specifies the complexity parameter used for Minimal Cost-Complexity Pruning. It must be a non-negative number.",
"type": "number",
"minimum": 0,
"default": 0.0
}
]
},
"max_samples": {
"oneOf": [
{
"error_msg": "The 'max_samples' parameter must be a positive integer or float, or null.",
"description": "The 'max_samples' parameter specifies the number of samples to draw from X to train each base estimator. It must be a positive integer or float, or null.",
"type": ["number", "null"],
"default": null
}
]
}
},
"type": "object"
}
93 changes: 93 additions & 0 deletions DashAI/back/models/parameters/models_schemas/RidgeRegression.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
{
"additionalProperties": false,
"error_msg": "The parameters for Ridge regression must be one or more of ['alpha', 'fit_intercept', 'copy_X', 'max_iter', 'tol', 'solver', 'positive', 'random_state'].",
"description": "Ridge regression is a linear model that includes L2 regularization, which can help mitigate issues of multicollinearity in linear regression.",
"properties": {
"alpha": {
"oneOf": [
{
"error_msg": "The 'alpha' parameter must be a positive number.",
"description": "The 'alpha' parameter specifies the regularization strength. It must be a positive number.",
"type": "number",
"exclusiveMinimum": 0,
"default": 1.0
}
]
},
"fit_intercept": {
"oneOf": [
{
"error_msg": "The 'fit_intercept' parameter must be of type boolean.",
"description": "The 'fit_intercept' parameter determines whether to calculate the intercept for this model. It must be of type boolean.",
"type": "boolean",
"default": true
}
]
},
"copy_X": {
"oneOf": [
{
"error_msg": "The 'copy_X' parameter must be of type boolean.",
"description": "The 'copy_X' parameter determines whether to copy the input variables. It must be of type boolean.",
"type": "boolean",
"default": true
}
]
},
"max_iter": {
"oneOf": [
{
"error_msg": "The 'max_iter' parameter must be a positive integer.",
"description": "The 'max_iter' parameter determines the maximum number of iterations for the solver. It must be a positive integer.",
"type": "integer",
"default": null,
"minimum": 1
}
]
},
"tol": {
"oneOf": [
{
"error_msg": "The 'tol' parameter must be a positive number.",
"description": "The 'tol' parameter determines the tolerance for the optimization. It must be a positive number.",
"type": "number",
"exclusiveMinimum": 0,
"default": 0.001
}
]
},
"solver": {
"oneOf": [
{
"error_msg": "The 'solver' parameter must be one of 'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', or 'lbfgs'.",
"description": "The 'solver' parameter determines the solver to use in the computational routines. It must be one of 'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', or 'lbfgs'.",
"type": "string",
"default": "auto",
"enum": ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"]
}
]
},
"positive": {
"oneOf": [
{
"error_msg": "The 'positive' parameter must be of type boolean.",
"description": "When set to True, the 'positive' parameter forces the coefficients to be positive. It must be of type boolean.",
"type": "boolean",
"default": false
}
]
},
"random_state": {
"oneOf": [
{
"error_msg": "The 'random_state' parameter must be an integer greater than or equal to 0, or null.",
"description": "The 'random_state' parameter determines the seed used by the random number generator. It must be an integer greater than or equal to 0, or null.",
"type": ["integer", "null"],
"default": null,
"minimum": 0
}
]
}
},
"type": "object"
}
7 changes: 7 additions & 0 deletions DashAI/back/models/regression_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from DashAI.back.models.base_model import BaseModel


class RegressionModel(BaseModel):
    """Base class shared by every DashAI model that targets regression.

    The only thing it adds on top of ``BaseModel`` is the
    ``COMPATIBLE_COMPONENTS`` declaration, which names ``RegressionTask``
    as the single component these models are compatible with.
    """

    COMPATIBLE_COMPONENTS = ["RegressionTask"]
6 changes: 4 additions & 2 deletions DashAI/back/models/scikit_learn/decision_tree_classifier.py
Original file line number Diff line number Diff line change
@@ -6,7 +6,9 @@
optimizer_int_field,
schema_field,
)
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel
from DashAI.back.models.scikit_learn.sklearn_like_classifier import (
SklearnLikeClassifier,
)
from DashAI.back.models.tabular_classification_model import TabularClassificationModel


@@ -64,7 +66,7 @@ class DecisionTreeClassifierSchema(BaseSchema):


class DecisionTreeClassifier(
TabularClassificationModel, SklearnLikeModel, _DecisionTreeClassifier
TabularClassificationModel, SklearnLikeClassifier, _DecisionTreeClassifier
):
"""Scikit-learn's Decision Tree Classifier wrapper for DashAI."""

8 changes: 6 additions & 2 deletions DashAI/back/models/scikit_learn/dummy_classifier.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from sklearn.dummy import DummyClassifier as _DummyClassifier

from DashAI.back.core.schema_fields import BaseSchema, enum_field, schema_field
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel
from DashAI.back.models.scikit_learn.sklearn_like_classifier import (
SklearnLikeClassifier,
)
from DashAI.back.models.tabular_classification_model import TabularClassificationModel


@@ -15,7 +17,9 @@ class DummyClassifierSchema(BaseSchema):
) # type: ignore


class DummyClassifier(TabularClassificationModel, SklearnLikeModel, _DummyClassifier):
class DummyClassifier(
TabularClassificationModel, SklearnLikeClassifier, _DummyClassifier
):
"""Scikit-learn's DummyClassifier wrapper for DashAI."""

SCHEMA = DummyClassifierSchema
222 changes: 222 additions & 0 deletions DashAI/back/models/scikit_learn/gradient_boosting_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
from sklearn.ensemble import GradientBoostingRegressor as _GBRegressor

from DashAI.back.core.schema_fields import (
BaseSchema,
bool_field,
enum_field,
none_type,
optimizer_float_field,
optimizer_int_field,
schema_field,
union_type,
)
from DashAI.back.models.regression_model import RegressionModel
from DashAI.back.models.scikit_learn.sklearn_like_regressor import (
SklearnLikeRegressor,
)


class GradientBoostingRSchema(BaseSchema):
    """Hyperparameter schema for scikit-learn's GradientBoostingRegressor.

    Every attribute mirrors a constructor argument of
    ``sklearn.ensemble.GradientBoostingRegressor``. Optimizer fields carry a
    placeholder dict with a fixed value plus lower and upper bounds; plain
    fields carry the default value directly.
    """

    loss: schema_field(
        enum_field(enum=["squared_error", "absolute_error", "huber", "quantile"]),
        placeholder="squared_error",
        description="Loss function to be optimized.",
    )  # type: ignore

    learning_rate: schema_field(
        optimizer_float_field(gt=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.1,
            "lower_bound": 0.01,
            "upper_bound": 1.0,
        },
        description="Learning rate shrinks the contribution of each tree.",
    )  # type: ignore

    n_estimators: schema_field(
        optimizer_int_field(ge=1),
        placeholder={
            "optimize": False,
            "fixed_value": 100,
            "lower_bound": 10,
            "upper_bound": 1000,
        },
        description="The number of boosting stages to be run.",
    )  # type: ignore

    subsample: schema_field(
        optimizer_float_field(gt=0.0, le=1.0),
        placeholder={
            "optimize": False,
            "fixed_value": 1.0,
            "lower_bound": 0.1,
            "upper_bound": 1.0,
        },
        description="The fraction of samples to be used for fitting the "
        "individual base learners.",
    )  # type: ignore

    # scikit-learn only accepts "friedman_mse" and "squared_error" here; the
    # legacy "mse" / "mae" aliases were removed from the library.
    criterion: schema_field(
        enum_field(enum=["friedman_mse", "squared_error"]),
        placeholder="friedman_mse",
        description="The function to measure the quality of a split.",
    )  # type: ignore

    # Expressed as a sample count (integer >= 2) so that the placeholder and
    # its bounds are valid values of the field itself.
    min_samples_split: schema_field(
        optimizer_int_field(ge=2),
        placeholder={
            "optimize": False,
            "fixed_value": 2,
            "lower_bound": 2,
            "upper_bound": 20,
        },
        description="The minimum number of samples required to split "
        "an internal node.",
    )  # type: ignore

    # Expressed as a sample count (integer >= 1), matching the placeholder.
    min_samples_leaf: schema_field(
        optimizer_int_field(ge=1),
        placeholder={
            "optimize": False,
            "fixed_value": 1,
            "lower_bound": 1,
            "upper_bound": 20,
        },
        description="The minimum number of samples required to be at a leaf node.",
    )  # type: ignore

    min_weight_fraction_leaf: schema_field(
        optimizer_float_field(ge=0.0, le=0.5),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0,
            "lower_bound": 0.0,
            "upper_bound": 0.5,
        },
        description="The minimum weighted fraction of the sum total of weights"
        " (of all the input samples) required to be at a leaf node.",
    )  # type: ignore

    max_depth: schema_field(
        union_type(optimizer_int_field(ge=1), none_type(int)),
        placeholder=3,
        description="The maximum depth of the individual regression estimators.",
    )  # type: ignore

    min_impurity_decrease: schema_field(
        optimizer_float_field(ge=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0,
            "lower_bound": 0.0,
            "upper_bound": 0.5,
        },
        description="A node will be split if this split induces a decrease of "
        "the impurity greater than or equal to this value.",
    )  # type: ignore

    random_state: schema_field(
        union_type(optimizer_int_field(ge=0), none_type(int)),
        placeholder=None,
        description="The seed of the pseudo-random number generator to use"
        " when shuffling the data.",
    )  # type: ignore

    max_features: schema_field(
        union_type(
            optimizer_float_field(gt=0.0, le=1.0),
            enum_field(enum=["sqrt", "log2", None]),
        ),
        placeholder=None,
        description="The number of features to consider when looking for "
        "the best split.",
    )  # type: ignore

    # scikit-learn requires alpha strictly inside (0, 1), so 1.0 is excluded
    # both from the constraint and from the search upper bound.
    alpha: schema_field(
        optimizer_float_field(gt=0.0, lt=1.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.9,
            "lower_bound": 0.1,
            "upper_bound": 0.99,
        },
        description="The alpha-quantile of the Huber loss function and the"
        " quantile loss function.",
    )  # type: ignore

    verbose: schema_field(
        optimizer_int_field(ge=0),
        placeholder={
            "optimize": False,
            "fixed_value": 0,
            "lower_bound": 0,
            "upper_bound": 100,
        },
        description="Enable verbose output.",
    )  # type: ignore

    # A tree needs at least two leaves; scikit-learn rejects max_leaf_nodes=1.
    max_leaf_nodes: schema_field(
        union_type(optimizer_int_field(ge=2), none_type(int)),
        placeholder=None,
        description="Grow trees with max_leaf_nodes in best-first fashion.",
    )  # type: ignore

    warm_start: schema_field(
        bool_field,
        placeholder=False,
        description="When set to True, reuse the solution of the previous call"
        " to fit and add more estimators to the ensemble.",
    )  # type: ignore

    # scikit-learn requires validation_fraction strictly inside (0, 1).
    validation_fraction: schema_field(
        optimizer_float_field(gt=0.0, lt=1.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.1,
            "lower_bound": 0.1,
            "upper_bound": 0.5,
        },
        description="The proportion of training data to set aside as "
        "validation set for early stopping.",
    )  # type: ignore

    n_iter_no_change: schema_field(
        union_type(optimizer_int_field(ge=1), none_type(int)),
        placeholder=None,
        description="The number of iterations with no improvement to wait "
        "before stopping the training.",
    )  # type: ignore

    tol: schema_field(
        optimizer_float_field(ge=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0001,
            "lower_bound": 1e-5,
            "upper_bound": 1e-1,
        },
        description="Tolerance for the early stopping.",
    )  # type: ignore

    ccp_alpha: schema_field(
        optimizer_float_field(ge=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0,
            "lower_bound": 0.0,
            "upper_bound": 1.0,
        },
        description="Complexity parameter used for Minimal Cost-Complexity Pruning.",
    )  # type: ignore


class GradientBoostingR(RegressionModel, SklearnLikeRegressor, _GBRegressor):
    """Scikit-learn's Gradient Boosting Regressor wrapper for DashAI."""

    # Schema class describing the hyperparameters of this model.
    SCHEMA = GradientBoostingRSchema

    def __init__(self, **kwargs) -> None:
        """Forward every keyword argument to the parent initializers."""
        super().__init__(**kwargs)
Original file line number Diff line number Diff line change
@@ -8,7 +8,9 @@
optimizer_int_field,
schema_field,
)
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel
from DashAI.back.models.scikit_learn.sklearn_like_classifier import (
SklearnLikeClassifier,
)
from DashAI.back.models.tabular_classification_model import TabularClassificationModel


@@ -88,7 +90,7 @@ class HistGradientBoostingClassifierSchema(BaseSchema):


class HistGradientBoostingClassifier(
TabularClassificationModel, SklearnLikeModel, _HistGradientBoostingClassifier
TabularClassificationModel, SklearnLikeClassifier, _HistGradientBoostingClassifier
):
"""Scikit-learn's HistGradientBoostingRegressor wrapper for DashAI."""

6 changes: 4 additions & 2 deletions DashAI/back/models/scikit_learn/k_neighbors_classifier.py
Original file line number Diff line number Diff line change
@@ -6,7 +6,9 @@
optimizer_int_field,
schema_field,
)
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel
from DashAI.back.models.scikit_learn.sklearn_like_classifier import (
SklearnLikeClassifier,
)
from DashAI.back.models.tabular_classification_model import TabularClassificationModel


@@ -41,7 +43,7 @@ class KNeighborsClassifierSchema(BaseSchema):


class KNeighborsClassifier(
TabularClassificationModel, SklearnLikeModel, _KNeighborsClassifier
TabularClassificationModel, SklearnLikeClassifier, _KNeighborsClassifier
):
"""Scikit-learn's K-Nearest Neighbors (KNN) classifier wrapper for DashAI."""

127 changes: 127 additions & 0 deletions DashAI/back/models/scikit_learn/linearSVR.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from sklearn.svm import LinearSVR as _LinearSVR

from DashAI.back.core.schema_fields import (
BaseSchema,
bool_field,
enum_field,
none_type,
optimizer_float_field,
optimizer_int_field,
schema_field,
union_type,
)
from DashAI.back.models.regression_model import RegressionModel
from DashAI.back.models.scikit_learn.sklearn_like_regressor import (
SklearnLikeRegressor,
)


class LinearSVRSchema(BaseSchema):
    """Hyperparameter schema for scikit-learn's LinearSVR.

    Every attribute mirrors a constructor argument of
    ``sklearn.svm.LinearSVR``. Optimizer fields carry a placeholder dict with
    a fixed value plus lower and upper bounds; plain fields carry the default
    value directly.
    """

    epsilon: schema_field(
        optimizer_float_field(ge=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0,
            "lower_bound": 0.0,
            "upper_bound": 1,
        },
        description="Epsilon parameter that specifies the epsilon-tube within "
        "which no penalty is associated.",
    )  # type: ignore

    # LinearSVR requires a strictly positive tolerance; tol=0 is rejected by
    # scikit-learn's parameter validation.
    tol: schema_field(
        optimizer_float_field(gt=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0001,
            "lower_bound": 1e-5,
            "upper_bound": 1e-1,
        },
        description="Tolerance for stopping criterion.",
    )  # type: ignore

    C: schema_field(
        optimizer_float_field(gt=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 1.0,
            "lower_bound": 0.1,
            "upper_bound": 10,
        },
        description="Regularization parameter. The strength of the regularization "
        "is inversely proportional to C.",
    )  # type: ignore

    loss: schema_field(
        enum_field(enum=["epsilon_insensitive", "squared_epsilon_insensitive"]),
        placeholder="epsilon_insensitive",
        description="Specifies the loss function. 'epsilon_insensitive' is "
        "the standard SVR loss.",
    )  # type: ignore

    fit_intercept: schema_field(
        bool_field,
        placeholder=True,
        description="Whether to calculate the intercept for this model.",
    )  # type: ignore

    intercept_scaling: schema_field(
        optimizer_float_field(gt=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 1.0,
            "lower_bound": 0.1,
            "upper_bound": 10,
        },
        description="When fit_intercept is True, instance vector x becomes "
        "[x, self.intercept_scaling] in the primal problem.",
    )  # type: ignore

    dual: schema_field(
        bool_field,
        placeholder=True,
        description="Select the algorithm to either solve the dual or primal"
        " optimization problem.",
    )  # type: ignore

    # LinearSVR is backed by liblinear (not libsvm), so the description names
    # the library whose runtime setting is actually involved.
    verbose: schema_field(
        optimizer_int_field(ge=0),
        placeholder={
            "optimize": False,
            "fixed_value": 0,
            "lower_bound": 0,
            "upper_bound": 100,
        },
        description="Enable verbose output. Note that this setting takes "
        "advantage of a per-process runtime setting in liblinear.",
    )  # type: ignore

    random_state: schema_field(
        union_type(optimizer_int_field(ge=0), none_type(int)),
        placeholder=None,
        description="The seed of the pseudo-random number generator to use"
        " when shuffling the data.",
    )  # type: ignore

    max_iter: schema_field(
        optimizer_int_field(ge=1),
        placeholder={
            "optimize": False,
            "fixed_value": 1000,
            "lower_bound": 100,
            "upper_bound": 10000,
        },
        description="The maximum number of iterations to be run.",
    )  # type: ignore


class LinearSVR(RegressionModel, SklearnLikeRegressor, _LinearSVR):
    """DashAI wrapper around scikit-learn's LinearSVR.

    Combines the DashAI regression-model bases with
    ``sklearn.svm.LinearSVR`` and exposes ``LinearSVRSchema`` as its
    hyperparameter schema.
    """

    SCHEMA = LinearSVRSchema

    def __init__(self, **kwargs) -> None:
        """Forward every keyword argument to the parent initializers."""
        super().__init__(**kwargs)
54 changes: 54 additions & 0 deletions DashAI/back/models/scikit_learn/linear_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from sklearn.linear_model import LinearRegression as _LinearRegression

from DashAI.back.core.schema_fields import (
BaseSchema,
bool_field,
none_type,
optimizer_int_field,
schema_field,
union_type,
)
from DashAI.back.models.regression_model import RegressionModel
from DashAI.back.models.scikit_learn.sklearn_like_regressor import (
SklearnLikeRegressor,
)


class LinearRegressionSchema(BaseSchema):
    """Hyperparameter schema for scikit-learn's LinearRegression.

    Every attribute mirrors a constructor argument of
    ``sklearn.linear_model.LinearRegression``.
    """

    fit_intercept: schema_field(
        bool_field,
        placeholder=True,
        description="Whether to calculate the intercept for this model. "
        "If set to False, no intercept will be used in calculations "
        "(e.g., data is expected to be centered).",
    )  # type: ignore

    # The scikit-learn keyword is spelled ``copy_X`` (capital X); a lowercase
    # name would not match any constructor argument of the estimator.
    copy_X: schema_field(  # noqa: N815
        bool_field,
        placeholder=True,
        description="If True, X will be copied; else, it may be overwritten.",
    )  # type: ignore

    # Only positive job counts (or None) pass validation, so the description
    # does not advertise -1.
    n_jobs: schema_field(
        union_type(optimizer_int_field(ge=1), none_type(int)),
        placeholder=None,
        description="The number of jobs to use for the computation. "
        "None means 1 job.",
    )  # type: ignore

    positive: schema_field(
        bool_field,
        placeholder=False,
        description="When set to True, forces the coefficients to be positive.",
    )  # type: ignore


class LinearRegression(RegressionModel, SklearnLikeRegressor, _LinearRegression):
    """DashAI wrapper around scikit-learn's LinearRegression.

    Exposes ``LinearRegressionSchema`` as its hyperparameter schema.
    """

    SCHEMA = LinearRegressionSchema

    def __init__(self, **kwargs) -> None:
        """Forward every keyword argument to the parent initializers."""
        super().__init__(**kwargs)
6 changes: 4 additions & 2 deletions DashAI/back/models/scikit_learn/logistic_regression.py
Original file line number Diff line number Diff line change
@@ -7,7 +7,9 @@
optimizer_int_field,
schema_field,
)
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel
from DashAI.back.models.scikit_learn.sklearn_like_classifier import (
SklearnLikeClassifier,
)
from DashAI.back.models.tabular_classification_model import TabularClassificationModel


@@ -56,7 +58,7 @@ class LogisticRegressionSchema(BaseSchema):


class LogisticRegression(
TabularClassificationModel, SklearnLikeModel, _LogisticRegression
TabularClassificationModel, SklearnLikeClassifier, _LogisticRegression
):
"""Scikit-learn's Logistic Regression wrapper for DashAI."""

228 changes: 228 additions & 0 deletions DashAI/back/models/scikit_learn/mlp_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
from sklearn.neural_network import MLPRegressor as _MLPregressor

from DashAI.back.core.schema_fields import (
BaseSchema,
bool_field,
enum_field,
none_type,
optimizer_float_field,
optimizer_int_field,
schema_field,
union_type,
)
from DashAI.back.models.regression_model import RegressionModel
from DashAI.back.models.scikit_learn.sklearn_like_regressor import (
SklearnLikeRegressor,
)


class MLPRegressorSchema(BaseSchema):
    """Hyperparameter schema for scikit-learn's MLPRegressor.

    Every attribute mirrors a constructor argument of
    ``sklearn.neural_network.MLPRegressor``. Optimizer fields carry a
    placeholder dict with a fixed value plus lower and upper bounds; plain
    fields carry the default value directly.
    """

    activation: schema_field(
        enum_field(enum=["identity", "logistic", "tanh", "relu"]),
        placeholder="relu",
        description="Activation function for the hidden layer.",
    )  # type: ignore

    solver: schema_field(
        enum_field(enum=["lbfgs", "sgd", "adam"]),
        placeholder="adam",
        description="The solver for weight optimization.",
    )  # type: ignore

    alpha: schema_field(
        optimizer_float_field(ge=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0001,
            "lower_bound": 1e-6,
            "upper_bound": 1e-1,
        },
        description="L2 penalty (regularization term) parameter.",
    )  # type: ignore

    batch_size: schema_field(
        union_type(optimizer_int_field(ge=1), enum_field(enum=["auto"])),
        placeholder="auto",
        description="Size of minibatches for stochastic optimizers.",
    )  # type: ignore

    learning_rate: schema_field(
        enum_field(enum=["constant", "invscaling", "adaptive"]),
        placeholder="constant",
        description="Learning rate schedule for weight updates.",
    )  # type: ignore

    # scikit-learn requires a strictly positive initial learning rate.
    learning_rate_init: schema_field(
        optimizer_float_field(gt=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.001,
            "lower_bound": 1e-5,
            "upper_bound": 1e-1,
        },
        description="The initial learning rate used.",
    )  # type: ignore

    power_t: schema_field(
        optimizer_float_field(gt=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.5,
            "lower_bound": 0.1,
            "upper_bound": 0.9,
        },
        description="The exponent for inverse scaling learning rate.",
    )  # type: ignore

    max_iter: schema_field(
        optimizer_int_field(ge=1),
        placeholder={
            "optimize": False,
            "fixed_value": 200,
            "lower_bound": 50,
            "upper_bound": 1000,
        },
        description="Maximum number of iterations.",
    )  # type: ignore

    shuffle: schema_field(
        bool_field,
        placeholder=True,
        description="Whether to shuffle samples in each iteration.",
    )  # type: ignore

    random_state: schema_field(
        union_type(optimizer_int_field(ge=0), none_type(int)),
        placeholder=None,
        description="The seed of the pseudo-random number generator to use "
        "when shuffling the data.",
    )  # type: ignore

    tol: schema_field(
        optimizer_float_field(ge=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0001,
            "lower_bound": 1e-6,
            "upper_bound": 1e-2,
        },
        description="Tolerance for the optimization.",
    )  # type: ignore

    verbose: schema_field(
        bool_field,
        placeholder=False,
        description="Whether to print progress messages to stdout.",
    )  # type: ignore

    warm_start: schema_field(
        bool_field,
        placeholder=False,
        description="When set to True, reuse the solution of the previous call"
        " to fit as initialization.",
    )  # type: ignore

    momentum: schema_field(
        optimizer_float_field(ge=0.0, le=1.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.9,
            "lower_bound": 0.0,
            "upper_bound": 1.0,
        },
        description="Momentum for gradient descent update.",
    )  # type: ignore

    nesterovs_momentum: schema_field(
        bool_field,
        placeholder=True,
        description="Whether to use Nesterov’s momentum.",
    )  # type: ignore

    early_stopping: schema_field(
        bool_field,
        placeholder=False,
        description="Whether to use early stopping to terminate training when"
        " validation score is not improving.",
    )  # type: ignore

    # A validation fraction of exactly 1.0 would leave no training data and is
    # rejected by scikit-learn, so the upper bound is exclusive.
    validation_fraction: schema_field(
        optimizer_float_field(gt=0.0, lt=1.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.1,
            "lower_bound": 0.1,
            "upper_bound": 0.5,
        },
        description="The proportion of training data to set aside as "
        "validation set for early stopping.",
    )  # type: ignore

    beta_1: schema_field(
        optimizer_float_field(gt=0.0, lt=1.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.9,
            "lower_bound": 0.1,
            "upper_bound": 0.999,
        },
        description="Exponential decay rate for estimates of first moment"
        " vector in Adam optimizer.",
    )  # type: ignore

    beta_2: schema_field(
        optimizer_float_field(gt=0.0, lt=1.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.999,
            "lower_bound": 0.1,
            "upper_bound": 0.999,
        },
        description="Exponential decay rate for estimates of second moment"
        " vector in Adam optimizer.",
    )  # type: ignore

    # scikit-learn requires a strictly positive epsilon.
    epsilon: schema_field(
        optimizer_float_field(gt=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 1e-08,
            "lower_bound": 1e-10,
            "upper_bound": 1e-6,
        },
        description="Value for numerical stability in Adam optimizer.",
    )  # type: ignore

    n_iter_no_change: schema_field(
        optimizer_int_field(ge=1),
        placeholder={
            "optimize": False,
            "fixed_value": 10,
            "lower_bound": 1,
            "upper_bound": 50,
        },
        description="Maximum number of epochs to not meet tol improvement.",
    )  # type: ignore

    max_fun: schema_field(
        optimizer_int_field(ge=1),
        placeholder={
            "optimize": False,
            "fixed_value": 15000,
            "lower_bound": 1000,
            "upper_bound": 20000,
        },
        description="Maximum number of loss function calls. Only used "
        "if solver='lbfgs'.",
    )  # type: ignore


class MLPRegression(RegressionModel, SklearnLikeRegressor, _MLPregressor):
    """DashAI wrapper around scikit-learn's MLPRegressor.

    Exposes ``MLPRegressorSchema`` as its hyperparameter schema.
    """

    SCHEMA = MLPRegressorSchema

    def __init__(self, **kwargs) -> None:
        """Forward every keyword argument to the parent initializers."""
        super().__init__(**kwargs)
10 changes: 4 additions & 6 deletions DashAI/back/models/scikit_learn/random_forest_classifier.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from sklearn.ensemble import RandomForestClassifier as _RandomForestClassifier

from DashAI.back.core.schema_fields import (
BaseSchema,
optimizer_int_field,
schema_field,
from DashAI.back.core.schema_fields import BaseSchema, optimizer_int_field, schema_field
from DashAI.back.models.scikit_learn.sklearn_like_classifier import (
SklearnLikeClassifier,
)
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel
from DashAI.back.models.tabular_classification_model import TabularClassificationModel


@@ -92,7 +90,7 @@ class RandomForestClassifierSchema(BaseSchema):


class RandomForestClassifier(
TabularClassificationModel, SklearnLikeModel, _RandomForestClassifier
TabularClassificationModel, SklearnLikeClassifier, _RandomForestClassifier
):
"""Scikit-learn's Random Forest classifier wrapper for DashAI."""

184 changes: 184 additions & 0 deletions DashAI/back/models/scikit_learn/random_forest_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
from sklearn.ensemble import RandomForestRegressor as _RandomForestRegressor

from DashAI.back.core.schema_fields import (
BaseSchema,
bool_field,
enum_field,
none_type,
optimizer_float_field,
optimizer_int_field,
schema_field,
union_type,
)
from DashAI.back.models.regression_model import RegressionModel
from DashAI.back.models.scikit_learn.sklearn_like_regressor import SklearnLikeRegressor


class RandomForestRegressionSchema(BaseSchema):
    """Hyperparameter schema for scikit-learn's RandomForestRegressor.

    Every attribute mirrors a constructor argument of
    ``sklearn.ensemble.RandomForestRegressor``. Optimizer fields carry a
    placeholder dict with a fixed value plus lower and upper bounds; plain
    fields carry the default value directly.
    """

    n_estimators: schema_field(
        optimizer_int_field(ge=1),
        placeholder={
            "optimize": False,
            "fixed_value": 100,
            "lower_bound": 10,
            "upper_bound": 1000,
        },
        description="The number of trees in the forest.",
    )  # type: ignore

    criterion: schema_field(
        enum_field(enum=["squared_error", "absolute_error", "poisson"]),
        placeholder="squared_error",
        description="The function to measure the quality of a split.",
    )  # type: ignore

    max_depth: schema_field(
        union_type(optimizer_int_field(ge=1), none_type(int)),
        placeholder=None,
        description="The maximum depth of the tree.",
    )  # type: ignore

    min_samples_split: schema_field(
        optimizer_int_field(ge=2),
        placeholder={
            "optimize": False,
            "fixed_value": 2,
            "lower_bound": 2,
            "upper_bound": 20,
        },
        description="The minimum number of samples required to split "
        "an internal node.",
    )  # type: ignore

    min_samples_leaf: schema_field(
        optimizer_int_field(ge=1),
        placeholder={
            "optimize": False,
            "fixed_value": 1,
            "lower_bound": 1,
            "upper_bound": 20,
        },
        description="The minimum number of samples required to be at a leaf node.",
    )  # type: ignore

    min_weight_fraction_leaf: schema_field(
        optimizer_float_field(ge=0.0, le=0.5),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0,
            "lower_bound": 0.0,
            "upper_bound": 0.5,
        },
        description="The minimum weighted fraction of the sum total of weights"
        " required to be at a leaf node.",
    )  # type: ignore

    # "auto" was removed from scikit-learn's forests. For regressors it meant
    # "use all features", which is what None selects, so None is the default.
    max_features: schema_field(
        union_type(
            optimizer_float_field(gt=0.0, le=1.0),
            enum_field(enum=["sqrt", "log2", None]),
        ),
        placeholder=None,
        description="The number of features to consider when looking for the"
        " best split.",
    )  # type: ignore

    # A tree needs at least two leaves; scikit-learn rejects max_leaf_nodes=1.
    max_leaf_nodes: schema_field(
        union_type(optimizer_int_field(ge=2), none_type(int)),
        placeholder=None,
        description="Grow trees with max_leaf_nodes in best-first fashion.",
    )  # type: ignore

    min_impurity_decrease: schema_field(
        optimizer_float_field(ge=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0,
            "lower_bound": 0.0,
            "upper_bound": 0.5,
        },
        description="A node will be split if this split induces a decrease of"
        " the impurity greater than or equal to this value.",
    )  # type: ignore

    bootstrap: schema_field(
        bool_field,
        placeholder=True,
        description="Whether bootstrap samples are used when building trees.",
    )  # type: ignore

    oob_score: schema_field(
        bool_field,
        placeholder=False,
        description="Whether to use out-of-bag samples to estimate the "
        "generalization score.",
    )  # type: ignore

    n_jobs: schema_field(
        union_type(optimizer_int_field(ge=1), none_type(int)),
        placeholder=None,
        description="The number of jobs to run in parallel for both fit and predict.",
    )  # type: ignore

    random_state: schema_field(
        union_type(optimizer_int_field(ge=0), none_type(int)),
        placeholder=None,
        description="The seed of the pseudo-random number generator to use"
        " when shuffling the data.",
    )  # type: ignore

    verbose: schema_field(
        optimizer_int_field(ge=0),
        placeholder={
            "optimize": False,
            "fixed_value": 0,
            "lower_bound": 0,
            "upper_bound": 100,
        },
        description="Controls the verbosity when fitting and predicting.",
    )  # type: ignore

    warm_start: schema_field(
        bool_field,
        placeholder=False,
        description="When set to True, reuse the solution of the previous "
        "call to fit and add more estimators to the ensemble.",
    )  # type: ignore

    ccp_alpha: schema_field(
        optimizer_float_field(ge=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.0,
            "lower_bound": 0.0,
            "upper_bound": 1.0,
        },
        description="Complexity parameter used for Minimal Cost-Complexity Pruning.",
    )  # type: ignore

    max_samples: schema_field(
        union_type(optimizer_float_field(gt=0.0, le=1.0), none_type(float)),
        placeholder=None,
        description="If bootstrap is True, the number of samples to draw from"
        " X to train each base estimator.",
    )  # type: ignore

    monotonic_cst: schema_field(
        none_type(float),
        placeholder=None,
        description="A constraint vector indicating the monotonicity "
        "constraint on each feature.",
    )  # type: ignore


class RandomForestRegression(
    RegressionModel, SklearnLikeRegressor, _RandomForestRegressor
):
    """Scikit-learn's Random Forest Regressor wrapper for DashAI."""

    # Schema class describing the hyperparameters of this model.
    SCHEMA = RandomForestRegressionSchema

    def __init__(self, **kwargs) -> None:
        """Forward every keyword argument to the parent initializers."""
        super().__init__(**kwargs)
96 changes: 96 additions & 0 deletions DashAI/back/models/scikit_learn/ridge_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from sklearn.linear_model import Ridge as _Ridge

from DashAI.back.core.schema_fields import (
BaseSchema,
bool_field,
enum_field,
none_type,
optimizer_float_field,
optimizer_int_field,
schema_field,
union_type,
)
from DashAI.back.models.regression_model import RegressionModel
from DashAI.back.models.scikit_learn.sklearn_like_regressor import (
SklearnLikeRegressor,
)


class RidgeRegressionSchema(BaseSchema):
    """Hyperparameter schema for scikit-learn's Ridge regression.

    Ridge regression is a linear model that includes L2 regularization.
    Every attribute mirrors a constructor argument of
    ``sklearn.linear_model.Ridge``.
    """

    # The bound is ge=0.0, so the description says "non-negative" to agree
    # with what the field actually accepts.
    alpha: schema_field(
        optimizer_float_field(ge=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 1.0,
            "lower_bound": 0.1,
            "upper_bound": 10.0,
        },
        description="Regularization strength; must be a non-negative float. "
        "Larger values specify stronger regularization.",
    )  # type: ignore

    fit_intercept: schema_field(
        bool_field,
        placeholder=True,
        description="Whether to calculate the intercept for this model. "
        "If set to False, no intercept will be used in calculations "
        "(e.g., data is expected to be centered).",
    )  # type: ignore

    # The scikit-learn keyword is spelled ``copy_X`` (capital X); a lowercase
    # name would not match any constructor argument of the estimator.
    copy_X: schema_field(  # noqa: N815
        bool_field,
        placeholder=True,
        description="If True, X will be copied; else, it may be overwritten.",
    )  # type: ignore

    max_iter: schema_field(
        optimizer_int_field(ge=1),
        placeholder={
            "optimize": False,
            "fixed_value": 1000,
            "lower_bound": 100,
            "upper_bound": 10000,
        },
        description="Maximum number of iterations for conjugate gradient solver.",
    )  # type: ignore

    tol: schema_field(
        optimizer_float_field(ge=0.0),
        placeholder={
            "optimize": False,
            "fixed_value": 0.001,
            "lower_bound": 1e-5,
            "upper_bound": 1e-1,
        },
        description="Precision of the solution.",
    )  # type: ignore

    # "lbfgs" is a valid Ridge solver and the one that supports
    # positive=True, so it is offered alongside the others.
    solver: schema_field(
        enum_field(
            enum=[
                "auto",
                "svd",
                "cholesky",
                "lsqr",
                "sparse_cg",
                "sag",
                "saga",
                "lbfgs",
            ]
        ),
        placeholder="auto",
        description="Solver to use in the computation. ‘auto’ chooses the "
        "solver automatically based on the type of data.",
    )  # type: ignore

    positive: schema_field(
        bool_field,
        placeholder=False,
        description="When set to True, forces the coefficients to be positive.",
    )  # type: ignore

    random_state: schema_field(
        union_type(optimizer_int_field(ge=0), none_type(int)),
        placeholder=None,
        description="The seed of the pseudo random number generator to use "
        "when shuffling the data. Pass an int for reproducible output across "
        "multiple function calls, or None to not set a specific seed.",
    )  # type: ignore


class RidgeRegression(RegressionModel, SklearnLikeRegressor, _Ridge):
    """DashAI wrapper around scikit-learn's Ridge regression.

    Exposes ``RidgeRegressionSchema`` as its hyperparameter schema.
    """

    SCHEMA = RidgeRegressionSchema

    def __init__(self, **kwargs) -> None:
        """Forward every keyword argument to the parent initializers."""
        super().__init__(**kwargs)
25 changes: 25 additions & 0 deletions DashAI/back/models/scikit_learn/sklearn_like_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import numpy as np

from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel


class SklearnLikeClassifier(SklearnLikeModel):
    """Base class for sklearn-like classifier models."""

    def predict(self, x_pred: DashAIDataset) -> np.ndarray:
        """Run inference on ``x_pred`` through the estimator's ``predict_proba``.

        Parameters
        ----------
        x_pred : DashAIDataset
            Dataset with the input data columns. A ``DashAIDataset`` is
            converted with ``to_pandas()`` first; any other object is handed
            to the estimator as received.

        Returns
        -------
        np.ndarray
            The value returned by ``predict_proba`` for ``x_pred``.
        """
        features = x_pred.to_pandas() if isinstance(x_pred, DashAIDataset) else x_pred
        return super().predict_proba(features)
20 changes: 1 addition & 19 deletions DashAI/back/models/scikit_learn/sklearn_like_model.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Type, Union
from typing import Type

import joblib
import pandas as pd

from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset
from DashAI.back.models.base_model import BaseModel
@@ -42,20 +41,3 @@ def fit(
x_pandas = x_train.to_pandas()
y_pandas = y_train.to_pandas()
return super().fit(x_pandas, y_pandas)

def predict(self, x_pred: Union[DashAIDataset, pd.DataFrame]):
"""Make a prediction with the model.
Parameters
----------
x_pred : Union[DashAIDataset, pd.DataFrame]
Dataset with the input data columns.
Returns
-------
array-like
Array with the predicted target values for x_pred
"""
if isinstance(x_pred, DashAIDataset):
x_pred = x_pred.to_pandas()
return super().predict_proba(x_pred)
23 changes: 23 additions & 0 deletions DashAI/back/models/scikit_learn/sklearn_like_regressor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import numpy as np

from DashAI.back.dataloaders.classes.dashai_dataset import DashAIDataset
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel


class SklearnLikeRegressor(SklearnLikeModel):
    """Class for handling sklearn-like regressor models."""

    def predict(self, x_pred: DashAIDataset) -> np.ndarray:
        """Predict target values for ``x_pred``.

        Parameters
        ----------
        x_pred : DashAIDataset
            Dataset with the input data columns. Objects exposing
            ``to_pandas()`` are converted first; anything else (for example
            an already-converted ``pandas.DataFrame``) is passed straight to
            the underlying estimator.

        Returns
        -------
        np.ndarray
            Array with the predicted target values for ``x_pred``.
        """
        # Convert only when a conversion is available, so input that is
        # already in pandas form no longer fails with AttributeError.
        if hasattr(x_pred, "to_pandas"):
            x_pred = x_pred.to_pandas()
        return super().predict(x_pred)
6 changes: 4 additions & 2 deletions DashAI/back/models/scikit_learn/svc.py
Original file line number Diff line number Diff line change
@@ -8,7 +8,9 @@
optimizer_int_field,
schema_field,
)
from DashAI.back.models.scikit_learn.sklearn_like_model import SklearnLikeModel
from DashAI.back.models.scikit_learn.sklearn_like_classifier import (
SklearnLikeClassifier,
)
from DashAI.back.models.tabular_classification_model import TabularClassificationModel


@@ -104,7 +106,7 @@ class SVCSchema(BaseSchema):
) # type: ignore


class SVC(TabularClassificationModel, SklearnLikeModel, _SVC):
class SVC(TabularClassificationModel, SklearnLikeClassifier, _SVC):
"""Scikit-learn's Support Vector Machine (SVM) classifier wrapper for DashAI."""

SCHEMA = SVCSchema
5 changes: 3 additions & 2 deletions DashAI/back/optimizers/optuna_optimizer.py
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@ class OptunaSchema(BaseSchema):
". Must be in string format and can be 'scale' or 'auto'.",
) # type: ignore
metric: schema_field(
enum_field(enum=["Accuracy", "F1", "Precision", "Recall"]),
enum_field(enum=["MAE", "RMSE"]),
placeholder="Accuracy",
description="Coefficient for 'rbf', 'poly' and 'sigmoid' kernels."
"Must be in string format and can be 'scale' or 'auto'.",
@@ -46,6 +46,7 @@ class OptunaOptimizer(BaseOptimizer):
"TabularClassificationTask",
"TextClassificationTask",
"TranslationTask",
"RegressionTask",
]

def __init__(self, n_trials=None, sampler=None, pruner=None, metric=None):
@@ -73,7 +74,7 @@ def optimize(self, model, input_dataset, output_dataset, parameters, task):
self.output_dataset = output_dataset
self.parameters = parameters

if self.metric["name"] in ["Accuracy", "F1", "Precision", "Recall"]:
if self.metric["name"] in ["MAE", "RMSE"]:
study = optuna.create_study(
direction="maximize", sampler=self.sampler(), pruner=self.pruner
)
1 change: 1 addition & 0 deletions DashAI/back/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# flake8: noqa
from DashAI.back.tasks.base_task import BaseTask
from DashAI.back.tasks.image_classification_task import ImageClassificationTask
from DashAI.back.tasks.regression_task import RegressionTask
from DashAI.back.tasks.tabular_classification_task import TabularClassificationTask
from DashAI.back.tasks.text_classification_task import TextClassificationTask
from DashAI.back.tasks.translation_task import TranslationTask
43 changes: 43 additions & 0 deletions DashAI/back/tasks/regression_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from typing import List

from datasets import DatasetDict, Value

from DashAI.back.tasks.base_task import BaseTask


class RegressionTask(BaseTask):
    """Task definition for regression on tabular data.

    Declares the task description and the input/output column metadata;
    ``prepare_for_task`` is a pass-through that leaves the dataset untouched.
    """

    # Description text for the task. The line breaks and leading whitespace
    # inside the literal are part of the runtime string.
    DESCRIPTION: str = """
Regression in machine learning involves predicting continuous values for
structured data organized in tabular form (rows and columns).
Models are trained to learn patterns and relationships in the data,
enabling accurate prediction of new instances."""
    # Column feature types and cardinalities declared for this task.
    # NOTE(review): presumably consumed by BaseTask when validating a
    # dataset -- confirm there; the consumer is not visible in this file.
    metadata: dict = {
        "inputs_types": [Value],
        "outputs_types": [Value],
        "inputs_cardinality": "n",
        "outputs_cardinality": 1,
    }

    def prepare_for_task(
        self, datasetdict: DatasetDict, outputs_columns: List[str]
    ) -> DatasetDict:
        """Return ``datasetdict`` as received.

        No column type conversion is applied and no copy is made: the very
        object passed in is returned.

        Parameters
        ----------
        datasetdict : DatasetDict
            Dataset to prepare.
        outputs_columns : List[str]
            Names of the output columns. Not used by this implementation.

        Returns
        -------
        DatasetDict
            The ``datasetdict`` argument itself, unchanged.
        """
        return datasetdict

0 comments on commit ad872bb

Please sign in to comment.