
"""Calculate metrics."""

from __future__ import annotations

from collections.abc import Iterable

import numpy as np
from scipy import interpolate
from scipy.stats import norm
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve

VAR_THRESH = 1e-2


def _div(x: float, y: float, default: float) -> float:
    """Solve the problem of division by 0 and round up.

    If y is non-zero, perform x/y and round to 8dp. If it is zero, return the
    default.

    Parameters
    ----------
    x : float
        numerator
    y : float
        denominator
    default : float
        return value if y == 0

    Returns
    -------
    division : float
        x / y, or default if y == 0
    """
    return round(float(x / y), 8) if y != 0 else float(default)
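
# A minimal usage sketch (illustrative, not part of the original module):
# _div guards metric ratios against zero denominators, e.g.
#
#     _div(1, 3, 0)  # -> 0.33333333 (rounded to 8 decimal places)
#     _div(1, 0, 0)  # -> 0.0 (zero denominator, so the default is returned)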


def _tpr_at_fpr(
    y_true: Iterable[float],
    y_score: Iterable[float],
    fpr: float = 0.001,
    fpr_perc: bool = False,
) -> float:
    """Compute the TPR at a fixed FPR.

    Returns the TPR corresponding to the given FPR, interpolating along the
    ROC curve to fill in gaps between computed points.

    Parameters
    ----------
    y_true : Iterable[float]
        actual class labels
    y_score : Iterable[float]
        predicted score
    fpr : float
        false positive rate at which to compute true positive rate
    fpr_perc : bool
        whether fpr is given as a percentage

    Returns
    -------
    tpr : float
        true positive rate at fpr
    """
    if fpr_perc:
        fpr /= 100.0

    fpr_vals, tpr_vals, thresh_vals = roc_curve(y_true, y_score)
    thresh_from_fpr = interpolate.interp1d(fpr_vals, thresh_vals)
    tpr_from_thresh = interpolate.interp1d(thresh_vals, tpr_vals)

    thresh = thresh_from_fpr(fpr)
    return tpr_from_thresh(thresh)
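
# Illustrative sketch (not part of the original module): calling _tpr_at_fpr on
# synthetic attack scores where members score higher on average; the data below
# is hypothetical.
#
#     rng = np.random.default_rng(0)
#     y_true = rng.integers(0, 2, size=1000)
#     y_score = np.where(y_true == 1,
#                        rng.normal(0.6, 0.2, 1000),
#                        rng.normal(0.4, 0.2, 1000))
#     _tpr_at_fpr(y_true, y_score, fpr=0.1)                # TPR at a fixed 10% FPR
#     _tpr_at_fpr(y_true, y_score, fpr=10, fpr_perc=True)  # same, FPR given in percent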


def _expected_auc_var(auc: float, num_pos: int, num_neg: int) -> float:
    """Compute variance of AUC under assumption of uniform probabilities.

    Uses the expression given as eqn (2) in https://cs.nyu.edu/~mohri/pub/area.pdf.

    Parameters
    ----------
    auc : float
        auc value at which to compute the variance
    num_pos : int
        number of positive examples
    num_neg : int
        number of negative examples

    Returns
    -------
    var : float
        null variance of AUC
    """
    p_xxy = p_xyy = 1 / 3
    return (
        auc * (1 - auc)
        + (num_pos - 1) * (p_xxy - auc**2)
        + (num_neg - 1) * (p_xyy - auc**2)
    ) / (num_pos * num_neg)
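
# Worked example (illustrative): under the null AUC of 0.5 with 100 positive
# and 100 negative examples,
#
#     _expected_auc_var(0.5, 100, 100)
#       = (0.25 + 99 * (1/3 - 0.25) + 99 * (1/3 - 0.25)) / (100 * 100)
#       = 16.75 / 10000
#       = 0.001675
#
# so the null standard deviation of the AUC is sqrt(0.001675) ~= 0.0409.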


def min_max_disc(
    y_true: np.ndarray, pred_probs: np.ndarray, x_prop: float = 0.1, log_p: bool = True
) -> tuple[float, float, float, float]:
    """Return non-average-case metrics for MIA attacks.

    Considers the actual frequency of membership amongst samples with the
    highest- and lowest-assessed probability of membership. If an MIA method
    confidently asserts that 5% of samples are members and 5% of samples are
    not, but cannot tell for the remaining 90% of samples, then these metrics
    will flag this behaviour, but AUC/advantage may not.

    Since the difference may be noisy, a p-value against a null of
    independence of true membership and assessed membership probability (that
    is, membership probabilities are essentially random) is also used as a
    metric (using the usual Gaussian approximation to the binomial). If the
    p-value is low and the frequency difference is high (>0.5) then the MIA
    attack is successful for some samples.

    Parameters
    ----------
    y_true : np.ndarray
        true labels
    pred_probs : np.ndarray
        probabilities of labels, or a monotonic transformation of
        probabilities
    x_prop : float
        proportion of samples with the highest- and lowest-probability of
        membership to be considered
    log_p : bool
        convert p-values to log(p)

    Returns
    -------
    maxd : float
        frequency of y_true=1 amongst the proportion x_prop of individuals
        with the highest assessed membership probability
    mind : float
        frequency of y_true=1 amongst the proportion x_prop of individuals
        with the lowest assessed membership probability
    mmd : float
        difference between maxd and mind
    pval : float
        p-value or log-p value corresponding to mmd against the null
        hypothesis that the random variables corresponding to y_true and
        pred_probs are independent

    Examples
    --------
    >>> y_true = np.random.choice(2, 100)
    >>> pred_probs = np.random.rand(100)
    >>> maxd, mind, mmd, pval = min_max_disc(y_true, pred_probs, x_prop=0.2, log_p=True)
    """
    n_examples = int(np.ceil(len(y_true) * x_prop))
    pos_frequency = np.mean(y_true)  # average frequency
    y_order = np.argsort(pred_probs)  # ordering permutation

    # Frequencies
    # y values corresponding to the lowest n_examples values of pred_probs
    y_lowest_n = y_true[y_order[:n_examples]]
    # y values corresponding to the highest n_examples values of pred_probs
    y_highest_n = y_true[y_order[-n_examples:]]
    maxd = np.mean(y_highest_n)
    mind = np.mean(y_lowest_n)
    mmd = maxd - mind

    # P-value
    # mmd is asymptotically distributed as N(0, sdm^2) under the null.
    sdm = np.sqrt(2 * pos_frequency * (1 - pos_frequency) / n_examples)
    pval = 1 - norm.cdf(mmd, loc=0, scale=sdm)  # normal CDF
    if log_p:
        # floor at log(1e-50) ~= -115.13 to avoid -inf
        pval = -115.13 if pval < 1e-50 else np.log(pval)

    return maxd, mind, mmd, pval


def auc_p_val(auc: float, n_pos: int, n_neg: int) -> tuple[float, float]:
    """Compute the p-value for a given AUC.

    Parameters
    ----------
    auc : float
        Observed AUC value
    n_pos : int
        Number of positive examples
    n_neg : int
        Number of negative examples

    Returns
    -------
    auc_p : float
        p-value of observing an AUC > auc by chance
    auc_std : float
        standard deviation of the null AUC distribution (mean = 0.5)
    """
    auc_std = np.sqrt(_expected_auc_var(0.5, n_pos, n_neg))
    auc_p = 1 - norm.cdf(auc, loc=0.5, scale=auc_std)
    return auc_p, auc_std
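

# Usage sketch (illustrative, not part of the original module): for a balanced
# test set of 200 examples, an observed AUC of 0.6 sits roughly 2.4 null
# standard deviations above 0.5.
#
#     auc_p, auc_std = auc_p_val(0.6, n_pos=100, n_neg=100)
#     # auc_std ~= 0.0409, auc_p ~= 0.007: an AUC this high is unlikely by chance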


def _permute_rows(X_test: np.ndarray, y_test: np.ndarray) -> None:
    """Permute the rows of the given arrays in place, with a fixed seed.

    Parameters
    ----------
    X_test : np.ndarray
        Array of features to be permuted.
    y_test : np.ndarray
        Array of labels to be permuted.
    """
    N, _ = X_test.shape
    order = np.random.RandomState(seed=10).permutation(N)
    # Assign into the existing arrays so the permutation is visible to the
    # caller; rebinding the local names would silently have no effect.
    X_test[:] = X_test[order, :]
    y_test[:] = y_test[order]


def get_metrics(  # pylint: disable=too-many-locals
    y_pred_proba: np.ndarray, y_test: np.ndarray, permute_rows: bool = True
) -> dict:
    """Calculate metrics, including attacker advantage for MIA binary.

    Implemented as Definition 4 in https://arxiv.org/pdf/1709.01604.pdf,
    which is also implemented in TensorFlow Privacy
    (https://github.com/tensorflow/privacy).

    Parameters
    ----------
    y_pred_proba : np.ndarray of shape [x, 2] and type float
        Predicted probabilities.
    y_test : np.ndarray
        Test data labels.
    permute_rows : bool, default True
        Whether to permute the arrays, see:
        https://github.com/AI-SDC/SACRO-ML/issues/106

    Returns
    -------
    metrics : dict
        dictionary of metric values

    Notes
    -----
    Includes the following metrics:

    * True positive rate or recall (TPR).
    * False positive rate (FPR), proportion of negative examples incorrectly
      classified as positives.
    * False alarm rate (FAR), proportion of objects classified as positives
      that are incorrect, also known as the false discovery rate.
    * True negative rate (TNR).
    * Positive predictive value or precision (PPV).
    * Negative predictive value (NPV).
    * False negative rate (FNR).
    * Accuracy (ACC).
    * F1 score, the harmonic mean of precision and recall.
    * Advantage.
    """
    if len(y_pred_proba.shape) != 2:
        raise ValueError("y_pred must be an array of floats of shape [x,2]")
    if y_pred_proba.shape[1] != 2:
        raise ValueError("Metrics for multiclass classification are unsupported")

    if permute_rows:
        _permute_rows(y_pred_proba, y_test)

    y_pred = np.argmax(y_pred_proba, axis=1)
    y_pred_proba = y_pred_proba[:, 1]
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

    metrics = {}
    # true positive rate or recall
    metrics["TPR"] = round(float(tp / (tp + fn)), 8)
    # false positive rate, proportion of negative examples incorrectly
    # classified as positives
    metrics["FPR"] = round(float(fp / (fp + tn)), 8)
    # false alarm rate, proportion of things classified as positives that are
    # incorrect, also known as the false discovery rate
    metrics["FAR"] = _div(fp, (fp + tp), 0)
    # true negative rate or specificity
    metrics["TNR"] = round(float(tn / (tn + fp)), 8)
    # precision or positive predictive value
    metrics["PPV"] = _div(tp, (tp + fp), 0)
    # negative predictive value
    metrics["NPV"] = _div(tn, (tn + fn), 0)
    # false negative rate
    metrics["FNR"] = round(float(fn / (tp + fn)), 8)
    # overall accuracy
    metrics["ACC"] = round(float((tp + tn) / (tp + fp + fn + tn)), 8)
    # harmonic mean of precision and sensitivity
    metrics["F1score"] = _div(
        2 * metrics["PPV"] * metrics["TPR"], metrics["PPV"] + metrics["TPR"], 0
    )
    # advantage
    metrics["Advantage"] = float(abs(metrics["TPR"] - metrics["FPR"]))
    # calculate AUC of the model
    metrics["AUC"] = round(roc_auc_score(y_test, y_pred_proba), 8)
    # calculate AUC p-value
    metrics["P_HIGHER_AUC"], _ = auc_p_val(
        metrics["AUC"], y_test.sum(), len(y_test) - y_test.sum()
    )

    fmax, fmin, fdif, pdif = min_max_disc(y_test, y_pred_proba)
    metrics["FMAX01"] = fmax
    metrics["FMIN01"] = fmin
    metrics["FDIF01"] = fdif
    metrics["PDIF01"] = -pdif  # use -log(p) so the answer is positive

    fmax, fmin, fdif, pdif = min_max_disc(y_test, y_pred_proba, x_prop=0.2)
    metrics["FMAX02"] = fmax
    metrics["FMIN02"] = fmin
    metrics["FDIF02"] = fdif
    metrics["PDIF02"] = -pdif  # use -log(p) so the answer is positive

    fmax, fmin, fdif, pdif = min_max_disc(y_test, y_pred_proba, x_prop=0.01)
    metrics["FMAX001"] = fmax
    metrics["FMIN001"] = fmin
    metrics["FDIF001"] = fdif
    metrics["PDIF001"] = -pdif  # use -log(p) so the answer is positive

    # Add some things useful for debugging / filtering
    metrics["pred_prob_var"] = y_pred_proba.var()

    # TPR at various FPR values
    fpr_vals = [0.5, 0.2, 0.1, 0.01, 0.001, 0.00001]
    for fpr in fpr_vals:
        tpr = _tpr_at_fpr(y_test, y_pred_proba, fpr=fpr)
        name = f"TPR@{fpr}"
        metrics[name] = tpr

    fpr, tpr, roc_thresh = roc_curve(y_test, y_pred_proba)
    metrics["fpr"] = fpr
    metrics["tpr"] = tpr
    metrics["roc_thresh"] = roc_thresh

    metrics["n_pos_test_examples"] = y_test.sum()
    metrics["n_neg_test_examples"] = (1 - y_test).sum()
    return metrics
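

# A minimal end-to-end sketch (illustrative, not part of the original module):
# scoring a hypothetical membership-inference attack with get_metrics. The
# synthetic predictions below are assumptions for demonstration only.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n = 1000
    y_demo = rng.integers(0, 2, size=n)
    # column 1 holds the assessed probability of membership; members score
    # slightly higher on average so the attack is weakly informative
    p_member = np.clip(0.5 + 0.1 * (2 * y_demo - 1) + rng.normal(0, 0.2, n), 0, 1)
    proba_demo = np.column_stack([1 - p_member, p_member])
    demo_metrics = get_metrics(proba_demo, y_demo)
    print(f"AUC={demo_metrics['AUC']}, advantage={demo_metrics['Advantage']}")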