# Source code for tempor.metrics.prediction.one_off.plugin_builtin_classification

"""Module with built-in metric plugins for the category: prediction -> one-off -> classification."""

from typing import Any, List, Tuple, cast

import numpy as np
import sklearn.metrics
from sklearn.preprocessing import label_binarize

from tempor.core import plugins
from tempor.metrics import metric, metric_typing


def _cast_to_y_pred(y_pred_proba: np.ndarray) -> np.ndarray:
    """Turn an array of class probabilities ``y_pred_proba`` into an array of class predictions ``y_pred``.

    A 2-D input is interpreted as per-sample class probabilities and reduced with ``argmax`` over axis 1.
    A 1-D input is interpreted as already being class labels and is returned as an array unchanged.

    Args:
        y_pred_proba (np.ndarray): Predicted probabilities (2-D) or predicted class labels (1-D). Any array-like
            accepted by ``np.asarray`` (e.g. a nested list) is supported.

    Returns:
        np.ndarray: Predicted classes.

    Raises:
        ValueError: If the input is neither 1-D nor 2-D.
    """
    # Convert first, so that ``ndim`` / ``shape`` are available for plain sequences as well as arrays.
    arr = np.asarray(y_pred_proba)

    if arr.ndim == 2:
        # Interpret as probabilities.
        return np.argmax(arr, axis=1)
    if arr.ndim == 1:
        # Interpret as class labels.
        return arr
    raise ValueError(f"Invalid shape of y_pred_proba: {arr.shape}")


def _get_y_pred_proba_hlpr(y_pred_proba: np.ndarray, nclasses: int) -> np.ndarray:
    """Select the form of ``y_pred_proba`` used by the AUC helpers, special-casing binary classification.

    The input is returned untouched in every case except one: when ``nclasses`` is 2 and ``y_pred_proba`` has at
    least two dimensions with exactly two entries along axis 1, only column 1 is returned.

    Args:
        y_pred_proba (np.ndarray): Predicted probabilities.
        nclasses (int): Number of classes.

    Returns:
        np.ndarray: Either ``y_pred_proba`` itself or its column at index 1.
    """
    # Nothing to infer outside the binary case.
    if nclasses != 2:
        return y_pred_proba

    dims = y_pred_proba.shape
    if len(dims) >= 2 and dims[1] == 2:
        # Two probability columns for a binary problem: keep the second one only.
        return y_pred_proba[:, 1]

    return y_pred_proba


def _prep_auc_multiclass(y_test: np.ndarray, y_pred_proba: np.ndarray) -> Tuple[np.ndarray, np.ndarray, int, List]:
    """Validate and normalize the inputs shared by the AUCROC and AUCPRC helpers.

    Args:
        y_test (np.ndarray): True labels.
        y_pred_proba (np.ndarray): Predicted probabilities.

    Returns:
        Tuple[np.ndarray, np.ndarray, int, List]: A tuple ``(y_test, y_pred_proba, n_classes, classes)`` where
        ``y_test`` is the labels as an array, ``y_pred_proba`` is the output of ``_get_y_pred_proba_hlpr``,
        ``classes`` is the sorted list of distinct values found in ``y_test`` and ``n_classes`` is its length.

    Raises:
        ValueError: If ``y_pred_proba`` contains any NaN value.
    """
    y_test = np.asarray(y_test)
    y_pred_proba = np.asarray(y_pred_proba)

    # Vectorized check: avoids iterating over every element in Python just to count NaNs.
    if np.isnan(y_pred_proba).any():
        raise ValueError("NaNs in predictions, aborting.")

    # The classes are inferred from the labels that actually occur in ``y_test``.
    classes = sorted(set(np.ravel(y_test)))
    n_classes = len(classes)

    y_pred_proba_tmp = _get_y_pred_proba_hlpr(y_pred_proba, n_classes)

    return y_test, y_pred_proba_tmp, n_classes, classes


def _evaluate_aucroc_multiclass(
    y_test: np.ndarray,
    y_pred_proba: np.ndarray,
) -> float:
    """Helper for evaluating AUCROC for any number of classes.

    With more than two distinct labels in ``y_test``, the labels are one-hot encoded and a micro-averaged ROC AUC
    is computed over the flattened label/score pairs. Otherwise ``sklearn.metrics.roc_auc_score`` is used directly.

    Args:
        y_test (np.ndarray): True labels.
        y_pred_proba (np.ndarray): Predicted probabilities.

    Returns:
        float: The area under the ROC curve.

    Raises:
        ValueError: If ``y_pred_proba`` contains NaNs (raised by ``_prep_auc_multiclass``).
    """
    y_test, y_pred_proba_tmp, n_classes, classes = _prep_auc_multiclass(y_test, y_pred_proba)

    if n_classes > 2:
        y_test_onehot = cast(np.ndarray, label_binarize(y_test, classes=classes, sparse_output=False))

        # Micro-average: every (sample, class) cell is treated as one binary decision.
        fpr_micro, tpr_micro, _ = sklearn.metrics.roc_curve(y_test_onehot.ravel(), y_pred_proba_tmp.ravel())
        aucroc = sklearn.metrics.auc(fpr_micro, tpr_micro)
    else:
        aucroc = sklearn.metrics.roc_auc_score(np.ravel(y_test), y_pred_proba_tmp, multi_class="ovr")

    return float(aucroc)


def _evaluate_aucprc_multiclass(
    y_test: np.ndarray,
    y_pred_proba: np.ndarray,
) -> float:
    """Helper for evaluating AUCPRC for any number of classes.

    With more than two distinct labels in ``y_test``, the labels are one-hot encoded and the micro-averaged
    average precision is computed. Otherwise ``sklearn.metrics.average_precision_score`` is applied to the
    flattened labels directly.

    Args:
        y_test (np.ndarray): True labels.
        y_pred_proba (np.ndarray): Predicted probabilities.

    Returns:
        float: The average precision score.

    Raises:
        ValueError: If ``y_pred_proba`` contains NaNs (raised by ``_prep_auc_multiclass``).
    """
    y_test, y_pred_proba_tmp, n_classes, classes = _prep_auc_multiclass(y_test, y_pred_proba)

    if n_classes > 2:
        y_test_onehot = cast(np.ndarray, label_binarize(y_test, classes=classes, sparse_output=False))

        # Only the micro-averaged score contributes to the result; no ROC / PR curves need to be built.
        aucprc = sklearn.metrics.average_precision_score(y_test_onehot, y_pred_proba_tmp, average="micro")
    else:
        aucprc = sklearn.metrics.average_precision_score(np.ravel(y_test), y_pred_proba_tmp)

    return float(aucprc)


@plugins.register_plugin(name="accuracy", category="prediction.one_off.classification", plugin_type="metric")
class AccuracyOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """Accuracy classification score, computed with ``sklearn.metrics.accuracy_score``."""

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The accuracy score.
        """
        return cast(
            float,
            sklearn.metrics.accuracy_score(actual, _cast_to_y_pred(predicted)),
        )


@plugins.register_plugin(name="f1_score_micro", category="prediction.one_off.classification", plugin_type="metric")
class F1ScoreMicroOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """F1 score is a harmonic mean of the precision and recall. This version uses the ``"micro"`` average: calculate
    metrics globally by counting the total true positives, false negatives and false positives.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The micro-averaged F1 score; ill-defined cases evaluate to 0 (``zero_division=0``).
        """
        return cast(
            float,
            sklearn.metrics.f1_score(
                actual,
                _cast_to_y_pred(predicted),
                average="micro",
                zero_division=0,
            ),
        )


@plugins.register_plugin(name="f1_score_macro", category="prediction.one_off.classification", plugin_type="metric")
class F1ScoreMacroOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """F1 score is a harmonic mean of the precision and recall. This version uses the ``"macro"`` average: calculate
    metrics for each label, and find their unweighted mean. This does not take label imbalance into account.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The macro-averaged F1 score; ill-defined cases evaluate to 0 (``zero_division=0``).
        """
        return cast(
            float,
            sklearn.metrics.f1_score(
                actual,
                _cast_to_y_pred(predicted),
                average="macro",
                zero_division=0,
            ),
        )


@plugins.register_plugin(name="f1_score_weighted", category="prediction.one_off.classification", plugin_type="metric")
class F1ScoreWeightedOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """F1 score is a harmonic mean of the precision and recall. This version uses the ``"weighted"`` average: calculate
    metrics for each label, and find their average weighted by support (the number of true instances for each label).
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The support-weighted F1 score; ill-defined cases evaluate to 0 (``zero_division=0``).
        """
        return cast(
            float,
            sklearn.metrics.f1_score(
                actual,
                _cast_to_y_pred(predicted),
                average="weighted",
                zero_division=0,
            ),
        )


@plugins.register_plugin(name="kappa", category="prediction.one_off.classification", plugin_type="metric")
class KappaOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """Computes Cohen's kappa, a score that expresses the level of agreement between two annotators on a classification
    problem.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The (unweighted) Cohen's kappa statistic.
        """
        return cast(
            float,
            sklearn.metrics.cohen_kappa_score(actual, _cast_to_y_pred(predicted)),
        )


@plugins.register_plugin(name="kappa_quadratic", category="prediction.one_off.classification", plugin_type="metric")
class KappaQuadraticOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """Computes Cohen's kappa, a score that expresses the level of agreement between two annotators on a classification
    problem. Weighted using the ``"quadratic"`` weighting.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: Cohen's kappa statistic computed with ``weights="quadratic"``.
        """
        return cast(
            float,
            sklearn.metrics.cohen_kappa_score(actual, _cast_to_y_pred(predicted), weights="quadratic"),
        )


@plugins.register_plugin(name="recall_micro", category="prediction.one_off.classification", plugin_type="metric")
class RecallMicroOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """Recall is defined as the number of true positives over the number of true positives plus the number of false
    negatives. This version (micro) calculates metrics globally by counting the total true positives.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The micro-averaged recall; ill-defined cases evaluate to 0 (``zero_division=0``).
        """
        return cast(
            float,
            sklearn.metrics.recall_score(
                actual,
                _cast_to_y_pred(predicted),
                average="micro",
                zero_division=0,
            ),
        )


@plugins.register_plugin(name="recall_macro", category="prediction.one_off.classification", plugin_type="metric")
class RecallMacroOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """Recall is defined as the number of true positives over the number of true positives plus the number of false
    negatives. This version (macro) calculates metrics for each label, and finds their unweighted mean.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The macro-averaged recall; ill-defined cases evaluate to 0 (``zero_division=0``).
        """
        return cast(
            float,
            sklearn.metrics.recall_score(
                actual,
                _cast_to_y_pred(predicted),
                average="macro",
                zero_division=0,
            ),
        )


@plugins.register_plugin(name="recall_weighted", category="prediction.one_off.classification", plugin_type="metric")
class RecallWeightedOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """Recall is defined as the number of true positives over the number of true positives plus the number of false
    negatives. This version (weighted) calculates metrics for each label, and finds their average weighted by support.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The support-weighted recall; ill-defined cases evaluate to 0 (``zero_division=0``).
        """
        return cast(
            float,
            sklearn.metrics.recall_score(
                actual,
                _cast_to_y_pred(predicted),
                average="weighted",
                zero_division=0,
            ),
        )


@plugins.register_plugin(name="precision_micro", category="prediction.one_off.classification", plugin_type="metric")
class PrecisionMicroOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """Precision is defined as the number of true positives over the number of true positives plus the number of false
    positives. This version (micro) calculates metrics globally by counting the total true positives.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The micro-averaged precision; ill-defined cases evaluate to 0 (``zero_division=0``).
        """
        return cast(
            float,
            sklearn.metrics.precision_score(
                actual,
                _cast_to_y_pred(predicted),
                average="micro",
                zero_division=0,
            ),
        )


@plugins.register_plugin(name="precision_macro", category="prediction.one_off.classification", plugin_type="metric")
class PrecisionMacroOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """Precision is defined as the number of true positives over the number of true positives plus the number of
    false positives. This version (macro) calculates metrics for each label, and finds their unweighted mean.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The macro-averaged precision; ill-defined cases evaluate to 0 (``zero_division=0``).
        """
        return cast(
            float,
            sklearn.metrics.precision_score(
                actual,
                _cast_to_y_pred(predicted),
                average="macro",
                zero_division=0,
            ),
        )


@plugins.register_plugin(name="precision_weighted", category="prediction.one_off.classification", plugin_type="metric")
class PrecisionWeightedOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """Precision is defined as the number of true positives over the number of true positives plus the number of
    false positives. This version (weighted) calculates metrics for each label, and finds their average weighted
    by support.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The support-weighted precision; ill-defined cases evaluate to 0 (``zero_division=0``).
        """
        return cast(
            float,
            sklearn.metrics.precision_score(
                actual,
                _cast_to_y_pred(predicted),
                average="weighted",
                zero_division=0,
            ),
        )


@plugins.register_plugin(name="mcc", category="prediction.one_off.classification", plugin_type="metric")
class MccOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """The Matthews Correlation Coefficient is used in machine learning as a measure of the quality of binary and
    multiclass classifications. It takes into account true and false positives and negatives and is generally
    regarded as a balanced measure which can be used even if the classes are of very different sizes.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): 2-D class probabilities (converted to labels via ``_cast_to_y_pred``) or 1-D
                class labels.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The Matthews correlation coefficient.
        """
        return cast(
            float,
            sklearn.metrics.matthews_corrcoef(actual, _cast_to_y_pred(predicted)),
        )


@plugins.register_plugin(name="aucprc", category="prediction.one_off.classification", plugin_type="metric")
class AucPrcOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """The average precision summarizes a precision-recall curve as the weighted mean of precisions achieved at each
    threshold, with the increase in recall from the previous threshold used as the weight.
    """

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual`` by delegating to ``_evaluate_aucprc_multiclass``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): Predicted probabilities.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The average precision score.
        """
        return _evaluate_aucprc_multiclass(actual, predicted)


@plugins.register_plugin(name="aucroc", category="prediction.one_off.classification", plugin_type="metric")
class AucRocOneOffClassificationMetric(metric.OneOffClassificationMetric):
    """The Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores."""

    @property
    def direction(self) -> metric_typing.MetricDirection:  # noqa: D102
        return "maximize"

    def _evaluate(self, actual: np.ndarray, predicted: np.ndarray, *args: Any, **kwargs: Any) -> float:
        """Score ``predicted`` against ``actual`` by delegating to ``_evaluate_aucroc_multiclass``.

        Args:
            actual (np.ndarray): True class labels.
            predicted (np.ndarray): Predicted probabilities.
            *args (Any): Ignored.
            **kwargs (Any): Ignored.

        Returns:
            float: The area under the ROC curve.
        """
        return _evaluate_aucroc_multiclass(actual, predicted)