Scikit-learn Examples

This section demonstrates how to use SACRO-ML with scikit-learn models for privacy assessment.

Cancer Dataset Example

Training a Random Forest model on the breast cancer dataset and running privacy attacks.

Training the Model:

Training Random Forest on Cancer Dataset

"""Example training a Random Forest classifier on breast cancer data.

This simple example demonstrates how the model and data can be passed to
the Target wrapper, which creates a directory with all saved information.
"""

import logging

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sacroml.attacks.target import Target

output_dir = "target_rf_breast_cancer"


if __name__ == "__main__":
    # Fetch the breast cancer features and labels as arrays, not DataFrames.
    logging.info("Loading dataset")
    features, labels = load_breast_cancer(return_X_y=True, as_frame=False)

    # Hold out 30% of the samples for the test set.
    logging.info("Splitting data into training and test sets")
    split = train_test_split(features, labels, test_size=0.3)
    X_train, X_test, y_train, y_test = split

    logging.info("Defining the model")
    forest_params = {"min_samples_split": 2, "min_samples_leaf": 1}
    classifier = RandomForestClassifier(**forest_params)

    logging.info("Training the model")
    classifier.fit(X_train, y_train)

    # Bundle the fitted model with the data it was trained and tested on.
    logging.info("Wrapping the model and data in a Target object")
    wrapped = Target(
        model=classifier,
        dataset_name="breast cancer",
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )

    # Persist everything so the attack script can reload it later.
    logging.info("Writing Target object to directory: '%s'", output_dir)
    wrapped.save(output_dir)

Running Privacy Attacks:

Running Privacy Attacks on Cancer Model

"""Example of how to run attacks on a model saved with the Target wrapper."""

import logging

from sacroml.attacks.likelihood_attack import LIRAAttack
from sacroml.attacks.structural_attack import StructuralAttack
from sacroml.attacks.target import Target
from sacroml.attacks.worst_case_attack import WorstCaseAttack

output_dir = "output_rf_breast_cancer"
target_dir = "target_rf_breast_cancer"

if __name__ == "__main__":
    # Restore the wrapped model and data from the target directory.
    logging.info("Loading Target object from '%s'", target_dir)
    target = Target()
    target.load(target_dir)

    # All attacks below are given the same output directory.
    logging.info("Running LiRA attack")
    lira = LIRAAttack(n_shadow_models=100, output_dir=output_dir)
    lira.attack(target)

    logging.info("Running worst case attack")
    worst_case_params = {
        "n_reps": 10,
        "n_dummy_reps": 1,
        "train_beta": 5,
        "test_beta": 2,
        "p_thresh": 0.05,
        "test_prop": 0.5,
        "output_dir": output_dir,
    }
    worst_case = WorstCaseAttack(**worst_case_params)
    worst_case.attack(target)

    logging.info("Running structural attack")
    structural = StructuralAttack(output_dir=output_dir)
    structural.attack(target)

    logging.info("Report available in directory: '%s'", output_dir)

Nursery Dataset Example

Training a Random Forest model on the nursery dataset and assessing privacy risks.

Training the Model:

Training Random Forest on Nursery Dataset

"""Example training a Random Forest classifier on the OpenML nursery dataset.

This example demonstrates how a dataset module can be supplied to the Target
wrapper along with the train and test indices. This is in contrast to the
breast cancer example where numpy arrays are passed directly.

This example also shows how to add feature encoding information to the Target
object. This is only necessary for attribute inference attacks.

A directory is created with the saved model and dataset code, which can then
be used to run attacks.
"""

import logging

from dataset import Nursery
from sklearn.ensemble import RandomForestClassifier

from sacroml.attacks.target import Target

output_dir = "target_rf_nursery"

if __name__ == "__main__":
    # The handler fetches and encodes the dataset when it is constructed.
    logging.info("Loading dataset")
    handler = Nursery()

    logging.info("Splitting data into training and test sets")
    indices_train, indices_test = handler.get_train_test_indices()

    # Materialise the train/test arrays from the processed data.
    logging.info("Getting data")
    features, labels = handler.get_data()
    X_train, y_train = handler.get_subset(features, labels, indices_train)
    X_test, y_test = handler.get_subset(features, labels, indices_test)

    logging.info("Defining the model")
    classifier = RandomForestClassifier(bootstrap=False)

    logging.info("Training the model")
    classifier.fit(X_train, y_train)
    train_accuracy = classifier.score(X_train, y_train)
    test_accuracy = classifier.score(X_test, y_test)
    logging.info("Base model train accuracy: %.4f", train_accuracy)
    logging.info("Base model test accuracy: %.4f", test_accuracy)

    # Only the dataset module path and the split indices are handed over
    # here, rather than the data arrays themselves.
    logging.info("Wrapping the model and data in a Target object")
    wrapped = Target(
        model=classifier,
        dataset_name="Nursery",  # Must match the class name in dataset module
        dataset_module_path="dataset.py",
        indices_train=indices_train,
        indices_test=indices_test,
    )

    # Register each feature with the one-hot columns that encode it.
    logging.info("Wrapping feature details and encoding for attribute inference")
    for position, columns in enumerate(handler.feature_indices):
        feature_name = handler.feature_names[position]
        wrapped.add_feature(
            name=feature_name,
            indices=columns,
            encoding="onehot",
        )

    logging.info("Writing Target object to directory: '%s'", output_dir)
    wrapped.save(output_dir)

Running Privacy Attacks:

Running Privacy Attacks on Nursery Model

"""Example of how to run attacks on a model saved with the Target wrapper."""

import logging

from sacroml.attacks.attribute_attack import AttributeAttack
from sacroml.attacks.likelihood_attack import LIRAAttack
from sacroml.attacks.structural_attack import StructuralAttack
from sacroml.attacks.target import Target
from sacroml.attacks.worst_case_attack import WorstCaseAttack

output_dir = "output_rf_nursery"
target_dir = "target_rf_nursery"

if __name__ == "__main__":
    # Restore the wrapped model and dataset details from the target directory.
    logging.info("Loading Target object from '%s'", target_dir)
    target = Target()
    target.load(target_dir)

    # All attacks below are given the same output directory.
    logging.info("Running LiRA attack")
    lira = LIRAAttack(n_shadow_models=100, output_dir=output_dir)
    lira.attack(target)

    logging.info("Running worst case attack")
    worst_case_params = {
        "n_reps": 10,
        "n_dummy_reps": 1,
        "train_beta": 5,
        "test_beta": 2,
        "p_thresh": 0.05,
        "test_prop": 0.5,
        "output_dir": output_dir,
    }
    worst_case = WorstCaseAttack(**worst_case_params)
    worst_case.attack(target)

    logging.info("Running structural attack")
    structural = StructuralAttack(output_dir=output_dir)
    structural.attack(target)

    logging.info("Running attribute attack")
    attribute = AttributeAttack(n_cpu=8, output_dir=output_dir)
    attribute.attack(target)

    logging.info("Report available in directory: '%s'", output_dir)

Dataset Processing:

Nursery Dataset Processing

"""Example dataset handler for the OpenML nursery dataset.

Scikit-learn datasets must implement `sacroml.attacks.data.SklearnDataHandler`.
"""

from collections.abc import Sequence

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sacroml.attacks.data import SklearnDataHandler

random_state = 1


class Nursery(SklearnDataHandler):
    """Data handler for the OpenML nursery dataset."""

    def __init__(self) -> None:
        """Fetch the nursery dataset and build its encoded arrays."""
        # Keep the unprocessed features and labels as string arrays.
        bunch = fetch_openml(data_id=26, as_frame=True)
        self.X_orig = np.asarray(bunch.data, dtype=str)
        self.y_orig = np.asarray(bunch.target, dtype=str)

        # One-hot encode the features and label-encode the targets.
        self.label_enc = LabelEncoder()
        self.feature_enc = OneHotEncoder()
        self.X = self.feature_enc.fit_transform(self.X_orig).toarray()
        self.y = self.label_enc.fit_transform(self.y_orig)

        # Columns of the encoded feature matrix belonging to each original
        # feature; this is only needed for attribute inference.
        self.feature_indices = [
            [0, 1, 2],  # parents
            [3, 4, 5, 6, 7],  # has_nurs
            [8, 9, 10, 11],  # form
            [12, 13, 14, 15],  # children
            [16, 17, 18],  # housing
            [19, 20],  # finance
            [21, 22, 23],  # social
            [24, 25, 26],  # health
        ]
        self.feature_names = bunch.feature_names

    def __len__(self) -> int:
        """Return the number of samples in the processed dataset."""
        return self.X.shape[0]

    def get_raw_data(self) -> tuple[np.ndarray, np.ndarray] | None:
        """Return the unprocessed feature and label arrays."""
        return self.X_orig, self.y_orig

    def get_data(self) -> tuple[np.ndarray, np.ndarray]:
        """Return the encoded feature and label arrays."""
        return self.X, self.y

    def get_subset(
        self, X: np.ndarray, y: np.ndarray, indices: Sequence[int]
    ) -> tuple[np.ndarray, np.ndarray]:
        """Return the rows of `X` and `y` selected by `indices`."""
        X_selected = X[indices]
        y_selected = y[indices]
        return X_selected, y_selected

    def get_train_test_indices(self) -> tuple[Sequence[int], Sequence[int]]:
        """Return a stratified 50/50 split of the sample indices.

        The split is stratified on the encoded labels and uses the
        module-level `random_state` as its seed.
        """
        row_ids = range(len(self))
        train_ids, test_ids = train_test_split(
            row_ids, test_size=0.5, stratify=self.y, random_state=random_state
        )
        return train_ids, test_ids

Scikit-learn Examples#

Cancer Dataset Example#

Nursery Dataset Example#

This Page