# Source code for mlexp.trainers.sklearn_trainer

import os
import pickle
from typing import Callable, Iterable, Type

import mlflow
import numpy as np
import sklearn
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

from mlexp.trainers._base_logger import _BaseLogger
from mlexp.trainers._base_trainer import _BaseTrainer
from mlexp.trainers._utils import _save_models


class SklearnTrainer(_BaseTrainer, _BaseLogger):
    """Training, logging and hyperparameters search for scikit-learn models."""

    # Path of the temporary estimator pickle relative to ``saved_files_path``.
    # It is also used verbatim as the neptune field name, so the leading slash
    # is part of the established name and must stay.
    _ESTIMATOR_PICKLE_SUFFIX = "/saved_utils/sklearn_estimator.pickle"
    # Artifact directory the pickle is logged under in mlflow.
    _ESTIMATOR_ARTIFACT_DIR = "saved_utils"

    def __init__(
        self,
        sklearn_estimator: Type[sklearn.base.BaseEstimator],
        validation_metric: Callable[[np.ndarray, np.ndarray], float],
        direction: str,
        saved_files_path: str,
        optimization_metric: str = "metric_mean_cv",
    ):
        """
        :param sklearn_estimator: Scikit-learn estimator class to be instantiated and fitted.
        :param validation_metric: Score function or loss function with signature validation_metric(y_true, y_pred),
            must return float/integer value of metric.
        :param direction: Direction of optimization. ``"maximize"`` means a higher metric is better;
            any other value is treated as minimization when scoring an iteration.
        :param saved_files_path: Directory to save logging files.
        :param optimization_metric: Metric to optimize.
        :raises AssertionError: If ``validation_metric`` is not callable.
        """

        if not callable(validation_metric):
            # Raised explicitly (instead of ``assert``) so the check is not
            # stripped under ``python -O``. AssertionError is kept so callers
            # that already catch it keep working.
            raise AssertionError(
                "validation_metric must be callable with signature "
                "validation_metric(y_true, y_pred)"
            )

        super().__init__(
            direction,
            saved_files_path,
            optimization_metric,
            validation_metric,
            model_type="sklearn",
        )

        self.sklearn_estimator = sklearn_estimator

    def _dump_estimator_pickle(self) -> str:
        """Pickle ``self.sklearn_estimator`` under ``saved_files_path`` and return the file path."""
        file_name = "{}{}".format(
            self.saved_files_path, self._ESTIMATOR_PICKLE_SUFFIX
        )
        with open(file_name, "wb") as f:
            pickle.dump(self.sklearn_estimator, f)
        return file_name

    def _initiate_neptune_run(
        self, neptune_run_params: dict, upload_files: Iterable[str] = ()
    ) -> None:
        """Upload the pickled estimator to the neptune run.

        The estimator is pickled to a temporary file, uploaded to ``self.run``
        under the field ``/saved_utils/sklearn_estimator.pickle`` and the
        temporary file is removed afterwards.

        :param neptune_run_params: Neptune run parameters (will be passed to `neptune.init_run <https://docs.neptune.ai/api-reference/neptune#.init_run>`_).
        :param upload_files: List of paths to files which will be logged in neptune run.
        """
        # NOTE(review): neither argument is used here and ``self.run`` must
        # already exist; presumably the run is created elsewhere - confirm.

        file_name = self._dump_estimator_pickle()
        try:
            self.run[self._ESTIMATOR_PICKLE_SUFFIX].upload(file_name, wait=True)
        finally:
            # Do not leave the temporary pickle behind when the upload fails.
            os.remove(file_name)

    def _initiate_mlflow_run(
        self,
        tracking_uri: str,
        experiment_name: str,
        mlflow_run_params: dict,
        upload_files: Iterable[str] = (),
    ) -> None:
        """Log the pickled estimator as an artifact of the mlflow run.

        The estimator is pickled to a temporary file, logged with
        ``mlflow.log_artifact`` under the ``saved_utils`` artifact directory
        and the temporary file is removed afterwards.

        :param tracking_uri: URI of mlflow server (will be passed to `mlflow.set_tracking_uri <https://www.mlflow.org/docs/latest/python_api/mlflow.html#mlflow.set_tracking_uri>`_).
        :param experiment_name: Name of mlflow experiment for logging (will be passed to `mlflow.set_experiment <https://www.mlflow.org/docs/latest/python_api/mlflow.html#mlflow.set_experiment>`_)
        :param mlflow_run_params: Mlflow run parameters (will be passed to `mlflow.start_run <https://www.mlflow.org/docs/latest/python_api/mlflow.html#mlflow.start_run>`_).
        :param upload_files: List of paths to files which will be logged in mlflow run.
        """
        # NOTE(review): none of the arguments is used here; presumably the
        # mlflow run is configured and started elsewhere - confirm.

        file_name = self._dump_estimator_pickle()
        try:
            mlflow.log_artifact(file_name, self._ESTIMATOR_ARTIFACT_DIR)
        finally:
            # Do not leave the temporary pickle behind when logging fails.
            os.remove(file_name)

    def _run_iteration(self, X, y, cv, params, trial_number) -> dict:
        """Train, evaluate scikit-learn model with defined parameters and log metrics.

        :param X: training data
        :type X: ndarray
        :param y: target values
        :type y: ndarray
        :param cv: indexes of cross validation; the score of the last split is
            reported as the test metric, all preceding splits form the
            cross-validation metric
        :type cv: iterable of (train_index, test_index)
        :param params: dictionary of parameters; ``params["model_params"]`` is
            passed as keyword arguments to the estimator constructor
        :type params: dict
        :param trial_number: number of current trial
        :type trial_number: int
        :return: results of current iteration
        :rtype: dictionary with keys
            'metrics' - dictionary with keys ('metric_mean_cv' - mean metric on cross validation,
                                              'metric_std_cv' - standard deviation of metric on cross validation,
                                              'metric_test' - metric on test data),
            'file_paths' - list of paths of the saved fitted models,
            'params' - the ``params`` dictionary that was passed in
        """

        is_higher_better = self.direction == "maximize"

        # With greater_is_better=False the scorer returns the negated metric.
        sklearn_scoring = make_scorer(
            self.validation_metric, greater_is_better=is_higher_better
        )

        estimator_initialised = self.sklearn_estimator(**params["model_params"])
        cv_results = cross_validate(
            estimator_initialised,
            X,
            y,
            cv=cv,
            scoring=sklearn_scoring,
            return_estimator=True,
            error_score="raise",
        )

        model_file_paths = _save_models(
            cv_results["estimator"], self.saved_files_path, trial_number
        )

        # Undo the scorer's sign flip so metrics are reported on their
        # original scale (the standard deviation is unaffected by the sign).
        coef = 1 if is_higher_better else -1
        scores = cv_results["test_score"]

        return {
            "metrics": {
                "metric_mean_cv": np.mean(scores[:-1] * coef),
                "metric_std_cv": np.std(scores[:-1]),
                "metric_test": scores[-1] * coef,
            },
            "file_paths": list(model_file_paths),
            "params": params,
        }