diff --git a/autointent/context/context.py b/autointent/context/context.py
index b6b11d4..a8d8a65 100644
--- a/autointent/context/context.py
+++ b/autointent/context/context.py
@@ -1,5 +1,5 @@
 from .data_handler import DataHandler
-from .optimization_logs import OptimizationLogs
+from .optimization_info import OptimizationInfo
 from .vector_index import VectorIndex
 
 
@@ -25,7 +25,7 @@ def __init__(
             regex_sampling,
             seed,
         )
-        self.optimization_logs = OptimizationLogs()
+        self.optimization_info = OptimizationInfo()
         self.vector_index = VectorIndex(db_dir, device, self.data_handler.multilabel, self.data_handler.n_classes)
 
         self.device = device
@@ -34,5 +34,5 @@ def __init__(
         self.seed = seed
 
     def get_best_collection(self):
-        model_name = self.optimization_logs.get_best_embedder()
+        model_name = self.optimization_info.get_best_embedder()
         return self.vector_index.get_collection(model_name)
diff --git a/autointent/context/optimization_info/__init__.py b/autointent/context/optimization_info/__init__.py
new file mode 100644
index 0000000..79b4521
--- /dev/null
+++ b/autointent/context/optimization_info/__init__.py
@@ -0,0 +1,2 @@
+from .data_models import RetrieverArtifact, ScorerArtifact
+from .optimization_info import OptimizationInfo
diff --git a/autointent/context/optimization_info/data_models.py b/autointent/context/optimization_info/data_models.py
new file mode 100644
index 0000000..dc0e597
--- /dev/null
+++ b/autointent/context/optimization_info/data_models.py
@@ -0,0 +1,91 @@
+from typing import Any
+
+import numpy as np
+from numpy.typing import NDArray
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class Artifact(BaseModel):
+    ...
+
+
+class RegexpArtifact(Artifact):
+    ...
+
+
+class RetrieverArtifact(Artifact):
+    """
+    Name of the embedding model chosen after retrieval optimization
+    """
+    embedder_name: str
+
+
+class ScorerArtifact(Artifact):
+    """
+    Outputs from best scorer, numpy arrays of shape (n_samples, n_classes)
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    test_scores: NDArray[np.float64] | None = Field(None, description="Scorer outputs for test utterances")
+    oos_scores: NDArray[np.float64] | None = Field(None, description="Scorer outputs for out-of-scope utterances")
+
+
+class PredictorArtifact(Artifact):
+    """
+    Outputs from best predictor, numpy array of shape (n_samples,) or
+    (n_samples, n_classes) depending on classification mode (multi-class or multi-label)
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    labels: NDArray[np.float64]
+
+
+class Artifacts(BaseModel):
+    """
+    Modules hyperparams and outputs. The best ones are transmitted between nodes of the pipeline
+    """
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    regexp: list[RegexpArtifact] = []
+    retrieval: list[RetrieverArtifact] = []
+    scoring: list[ScorerArtifact] = []
+    prediction: list[PredictorArtifact] = []
+
+    def __getitem__(self, node_type: str) -> list:
+        return getattr(self, node_type)
+
+
+class Trial(BaseModel):
+    """
+    Detailed representation of one optimization trial
+    """
+    module_type: str
+    module_params: dict[str, Any]
+    metric_name: str
+    metric_value: float
+
+class Trials(BaseModel):
+    """
+    Detailed representation of optimization results
+    """
+    regexp: list[Trial] = []
+    retrieval: list[Trial] = []
+    scoring: list[Trial] = []
+    prediction: list[Trial] = []
+
+    def __getitem__(self, node_type: str) -> list[Trial]:
+        return getattr(self, node_type)
+
+
+class TrialsIds(BaseModel):
+    """
+    Indices of the best trials for each node type
+    """
+    regexp: int | None = None
+    retrieval: int | None = None
+    scoring: int | None = None
+    prediction: int | None = None
+
+    def __getitem__(self, node_type: str) -> int | None:
+        return getattr(self, node_type)
+
+    def __setitem__(self, node_type: str, idx: int) -> None:
+        setattr(self, node_type, idx)
diff --git a/autointent/context/optimization_info/logger.py b/autointent/context/optimization_info/logger.py
new file mode 100644
index 0000000..0464efc
--- /dev/null
+++ b/autointent/context/optimization_info/logger.py
@@ -0,0 +1,25 @@
+import logging
+from pprint import pformat
+
+
+def get_logger() -> logging.Logger:
+    logger = logging.getLogger(__name__)
+
+    formatter = PPrintFormatter()
+    ch = logging.StreamHandler()
+    ch.setFormatter(formatter)
+    logger.addHandler(ch)
+
+    return logger
+
+
+class PPrintFormatter(logging.Formatter):
+    def __init__(self):
+        super().__init__(fmt="{asctime} - {name} - {levelname} - {message}", style="{")
+
+    def format(self, record):
+        if isinstance(record.msg, dict):
+            format_msg = "module scoring results:\n"
+            dct_to_str = pformat(record.msg)
+            record.msg = format_msg + dct_to_str
+        return super().format(record)
diff --git a/autointent/context/optimization_info/optimization_info.py b/autointent/context/optimization_info/optimization_info.py
new file mode 100644
index 0000000..8eb9fac
--- /dev/null
+++ b/autointent/context/optimization_info/optimization_info.py
@@ -0,0 +1,80 @@
+import numpy as np
+
+from .data_models import Artifact, Artifacts, NDArray, RetrieverArtifact, ScorerArtifact, Trial, Trials, TrialsIds
+from .logger import get_logger
+
+
+class OptimizationInfo:
+    """TODO continuous IO with file system (to be able to restore the state of optimization)"""
+
+    def __init__(self):
+        self._logger = get_logger()
+
+        self.artifacts = Artifacts()
+        self.trials = Trials()
+        self._trials_best_ids = TrialsIds()
+
+    def log_module_optimization(
+        self,
+        node_type: str,
+        module_type: str,
+        module_params: dict,
+        metric_value: float,
+        metric_name: str,
+        artifact: Artifact,
+    ):
+        """
+        Purposes:
+        - save optimization results in a text form (hyperparameters and corresponding metrics)
+        - update best assets
+        """
+
+        # save trial
+        trial = Trial(
+            module_type=module_type,
+            metric_name=metric_name,
+            metric_value=metric_value,
+            module_params=module_params,
+        )
+        self.trials[node_type].append(trial)
+        self._logger.info(trial.model_dump())
+
+        # save artifact
+        self.artifacts[node_type].append(artifact)
+
+    def _get_metrics_values(self, node_type: str) -> list[float]:
+        return [trial.metric_value for trial in self.trials[node_type]]
+
+    def _get_best_trial_idx(self, node_type: str) -> int:
+        res = self._trials_best_ids[node_type]
+        if res is not None:
+            return res
+        res = np.argmax(self._get_metrics_values(node_type))
+        self._trials_best_ids[node_type] = res
+        return res
+
+    def _get_best_artifact(self, node_type: str) -> Artifact:
+        i_best = self._get_best_trial_idx(node_type)
+        return self.artifacts[node_type][i_best]
+
+    def get_best_embedder(self) -> str:
+        best_retriever_artifact: RetrieverArtifact = self._get_best_artifact(node_type="retrieval")
+        return best_retriever_artifact.embedder_name
+
+    def get_best_test_scores(self) -> NDArray[np.float64]:
+        best_scorer_artifact: ScorerArtifact = self._get_best_artifact(node_type="scoring")
+        return best_scorer_artifact.test_scores
+
+    def get_best_oos_scores(self) -> NDArray[np.float64]:
+        best_scorer_artifact: ScorerArtifact = self._get_best_artifact(node_type="scoring")
+        return best_scorer_artifact.oos_scores
+
+    def dump(self):
+        node_wise_metrics = {
+            node_type: self._get_metrics_values(node_type)
+            for node_type in ["regexp", "retrieval", "scoring", "prediction"]
+        }
+        return {
+            "metrics": node_wise_metrics,
+            "configs": self.trials.model_dump(),
+        }
diff --git a/autointent/context/optimization_logs.py b/autointent/context/optimization_logs.py
deleted file mode 100644
index 69144ce..0000000
--- a/autointent/context/optimization_logs.py
+++ /dev/null
@@ -1,91 +0,0 @@
-import logging
-from pprint import pformat
-
-
-class OptimizationLogs:
-    """TODO continous IO with file system (to be able to restore the state of optimization)"""
-
-    def __init__(self):
-        self._logger = self._get_logger()
-
-        self.cache = {
-            "best_assets": {
-                "regexp": None,  # TODO: choose the format
-                "retrieval": None,  # str, name of best retriever
-                "scoring": {
-                    "test_scores": None, "oos_scores": None
-                },  # dict with values of two np.ndarrays of shape (n_samples, n_classes), from best scorer
-                "prediction": None,  # np.ndarray of shape (n_samples,), from best predictor
-            },
-            "metrics": {"regexp": [], "retrieval": [], "scoring": [], "prediction": []},
-            "configs": {"regexp": [], "retrieval": [], "scoring": [], "prediction": []},
-        }
-
-    def log_module_optimization(
-        self,
-        node_type: str,
-        module_type: str,
-        module_config: dict,
-        metric_value: float,
-        metric_name: str,
-        assets,
-    ):
-        """
-        Purposes:
-        - save optimization results in a text form (hyperparameters and corresponding metrics)
-        - update best assets
-        """
-
-        # "update leaderboard" if it's a new best metric
-        metrics_list = self.cache["metrics"][node_type]
-        previous_best = max(metrics_list, default=-float("inf"))
-        if metric_value > previous_best:
-            self.cache["best_assets"][node_type] = assets
-
-        # logging
-        logs = dict(
-            module_type=module_type,
-            metric_name=metric_name,
-            metric_value=metric_value,
-            **module_config,
-        )
-        self.cache["configs"][node_type].append(logs)
-        self._logger.info(logs)
-        metrics_list.append(metric_value)
-
-    def get_best_embedder(self):
-        return self.cache["best_assets"]["retrieval"]
-
-    def get_best_test_scores(self):
-        return self.cache["best_assets"]["scoring"]["test_scores"]
-
-    def get_best_oos_scores(self):
-        return self.cache["best_assets"]["scoring"]["oos_scores"]
-
-    def dump(self):
-        return {
-            "metrics": self.cache["metrics"],
-            "configs": self.cache["configs"],
-        }
-
-    def _get_logger(self):
-        logger = logging.getLogger(__name__)
-
-        formatter = PPrintFormatter()
-        ch = logging.StreamHandler()
-        ch.setFormatter(formatter)
-        logger.addHandler(ch)
-
-        return logger
-
-
-class PPrintFormatter(logging.Formatter):
-    def __init__(self):
-        super().__init__(fmt="{asctime} - {name} - {levelname} - {message}", style="{")
-
-    def format(self, record):
-        if isinstance(record.msg, dict):
-            format_msg = "module scoring results:\n"
-            dct_to_str = pformat(record.msg)
-            record.msg = format_msg + dct_to_str
-        return super().format(record)
diff --git a/autointent/modules/prediction/base.py b/autointent/modules/prediction/base.py
index 7f8361d..804e0d9 100644
--- a/autointent/modules/prediction/base.py
+++ b/autointent/modules/prediction/base.py
@@ -30,9 +30,9 @@ def clear_cache(self):
 
 def get_prediction_evaluation_data(context: Context):
     labels = context.data_handler.labels_test
-    scores = context.optimization_logs.get_best_test_scores()
+    scores = context.optimization_info.get_best_test_scores()
 
-    oos_scores = context.optimization_logs.get_best_oos_scores()
+    oos_scores = context.optimization_info.get_best_oos_scores()
     if oos_scores is not None:
         oos_labels = [[0] * context.n_classes] * len(oos_scores) if context.multilabel else [-1] * len(oos_scores)
         labels = np.concatenate([labels, oos_labels])
diff --git a/autointent/modules/prediction/tunable.py b/autointent/modules/prediction/tunable.py
index 768e575..f83e5f9 100644
--- a/autointent/modules/prediction/tunable.py
+++ b/autointent/modules/prediction/tunable.py
@@ -5,7 +5,7 @@
 from optuna.trial import Trial
 from sklearn.metrics import f1_score
 
-from .base import Context, PredictionModule
+from .base import Context, PredictionModule, get_prediction_evaluation_data
 from .threshold import multiclass_predict, multilabel_predict
 
 
@@ -24,9 +24,10 @@ def fit(self, context: Context):
         )
 
         thresh_optimizer = ThreshOptimizer(n_classes=context.n_classes, multilabel=context.multilabel)
+        labels, scores = get_prediction_evaluation_data(context)
        thresh_optimizer.fit(
-            probas=context.optimization_logs.get_best_test_scores(),
-            labels=context.data_handler.labels_test,
+            probas=scores,
+            labels=labels,
             seed=context.seed,
             tags=self.tags,
         )
diff --git a/autointent/modules/retrieval/vectordb.py b/autointent/modules/retrieval/vectordb.py
index 023f257..5bc0d82 100644
--- a/autointent/modules/retrieval/vectordb.py
+++ b/autointent/modules/retrieval/vectordb.py
@@ -4,6 +4,7 @@
 from chromadb import Collection
 
 from autointent.context import Context
+from autointent.context.optimization_info import RetrieverArtifact
 from autointent.metrics import RetrievalMetricFn
 
 from .base import RetrievalModule
@@ -26,8 +27,8 @@ def score(self, context: Context, metric_fn: RetrievalMetricFn) -> tuple[float,
         )
         return metric_fn(context.data_handler.labels_test, labels_pred)
 
-    def get_assets(self):
-        return self.model_name
+    def get_assets(self) -> RetrieverArtifact:
+        return RetrieverArtifact(embedder_name=self.model_name)
 
     def clear_cache(self):
         model = self.collection._embedding_function._model  # noqa: SLF001
diff --git a/autointent/modules/scoring/base.py b/autointent/modules/scoring/base.py
index 646e0c3..a44e7e2 100644
--- a/autointent/modules/scoring/base.py
+++ b/autointent/modules/scoring/base.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 
+from autointent.context.optimization_info import ScorerArtifact
 from autointent.metrics import ScoringMetricFn
 from autointent.modules.base import Context, Module
 
@@ -21,8 +22,8 @@ def score(self, context: Context, metric_fn: ScoringMetricFn) -> tuple[float, np
             self._oos_scores = self.predict(context.data_handler.oos_utterances)
         return res
 
-    def get_assets(self):
-        return {"test_scores": self._test_scores, "oos_scores": self._oos_scores}
"oos_scores": self._oos_scores} + def get_assets(self) -> ScorerArtifact: + return ScorerArtifact(test_scores=self._test_scores, oos_scores=self._oos_scores) @abstractmethod def predict(self, utterances: list[str]): diff --git a/autointent/nodes/base.py b/autointent/nodes/base.py index ab971dc..1e0f44b 100644 --- a/autointent/nodes/base.py +++ b/autointent/nodes/base.py @@ -45,7 +45,7 @@ def fit(self, context: Context): metric_value = module.score(context, self.metrics_available[self.metric_name]) assets = module.get_assets() - context.optimization_logs.log_module_optimization( + context.optimization_info.log_module_optimization( self.node_type, module_type, module_kwargs, diff --git a/autointent/pipeline/pipeline.py b/autointent/pipeline/pipeline.py index 20d1b69..ac50f59 100644 --- a/autointent/pipeline/pipeline.py +++ b/autointent/pipeline/pipeline.py @@ -42,7 +42,7 @@ def optimize(self, context: Context): def dump(self, logs_dir: str, run_name: str): self._logger.debug("dumping logs...") - optimization_results = self.context.optimization_logs.dump() + optimization_results = self.context.optimization_info.dump() # create appropriate directory logs_dir = Path.cwd() if logs_dir == "" else Path(logs_dir) diff --git a/pyproject.toml b/pyproject.toml index 6c7526b..9705f55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ sre-yield = "^1.2" optuna = "^4.0.0" xeger = "^0.4.0" pathlib = "^1.0.1" +pydantic = "^2.9.2" [tool.poetry.group.dev] @@ -25,6 +26,8 @@ optional = true [tool.poetry.group.dev.dependencies] datasets = "2.20.0" tach = "^0.11.3" +ipykernel = "^6.29.5" +ipywidgets = "^8.1.5" [tool.poetry.group.test] optional = true diff --git a/tests/test_modules/test_scoring/test_knn.py b/tests/test_modules/test_scoring/test_knn.py index 756d443..2f76c89 100644 --- a/tests/test_modules/test_scoring/test_knn.py +++ b/tests/test_modules/test_scoring/test_knn.py @@ -1,7 +1,7 @@ import numpy as np from autointent import Context -from autointent.metrics import scoring_roc_auc +from autointent.metrics import retrieval_hit_rate, scoring_roc_auc from autointent.modules import KNNScorer, VectorDBModule from autointent.pipeline.main import get_db_dir, get_run_name, load_data, setup_logging @@ -24,11 +24,22 @@ def test_base_knn(): seed=0, ) - vector_db = VectorDBModule(k=3, model_name="sergeyzh/rubert-tiny-turbo") + retrieval_params = {"k": 3, "model_name": "sergeyzh/rubert-tiny-turbo"} + vector_db = VectorDBModule(**retrieval_params) vector_db.fit(context) + metric_value = vector_db.score(context, retrieval_hit_rate) + artifact = vector_db.get_assets() + context.optimization_info.log_module_optimization( + node_type="retrieval", + module_type="vector_db", + module_params=retrieval_params, + metric_value=metric_value, + metric_name="retrieval_hit_rate_macro", + artifact=artifact, + ) + scorer = KNNScorer(k=3, weights="distance") - context.optimization_logs.cache["best_assets"]["retrieval"] = "sergeyzh/rubert-tiny-turbo" scorer.fit(context) assert scorer.score(context, scoring_roc_auc) == 1 predictions = scorer.predict( diff --git a/tests/test_modules/test_scoring/test_mlknn.py b/tests/test_modules/test_scoring/test_mlknn.py index 984e21a..9773f2f 100644 --- a/tests/test_modules/test_scoring/test_mlknn.py +++ b/tests/test_modules/test_scoring/test_mlknn.py @@ -1,7 +1,7 @@ import numpy as np from autointent import Context -from autointent.metrics import scoring_f1 +from autointent.metrics import retrieval_hit_rate_macro, scoring_f1 from autointent.modules import VectorDBModule from 
 from autointent.pipeline.main import get_db_dir, get_run_name, load_data, setup_logging
@@ -35,13 +35,24 @@ def test_base_mlknn():
         seed=0,
     )
 
-    vector_db = VectorDBModule(k=3, model_name="sergeyzh/rubert-tiny-turbo")
+    retrieval_params = {"k": 3, "model_name": "sergeyzh/rubert-tiny-turbo"}
+    vector_db = VectorDBModule(**retrieval_params)
     vector_db.fit(context)
-    scorer = MLKnnScorer(k=3)
+    metric_value = vector_db.score(context, retrieval_hit_rate_macro)
+    artifact = vector_db.get_assets()
+    context.optimization_info.log_module_optimization(
+        node_type="retrieval",
+        module_type="vector_db",
+        module_params=retrieval_params,
+        metric_value=metric_value,
+        metric_name="retrieval_hit_rate_macro",
+        artifact=artifact,
+    )
 
-    context.optimization_logs.cache["best_assets"]["retrieval"] = "sergeyzh/rubert-tiny-turbo"
+    scorer = MLKnnScorer(k=3)
     scorer.fit(context)
     np.testing.assert_almost_equal(0.6663752913752914, scorer.score(context, scoring_f1))
+
     predictions = scorer.predict_labels(
         [
             "why is there a hold on my american saving bank account",