Scikit-learn Examples
This section demonstrates how to use SACRO-ML to assess the privacy risks of scikit-learn models.
Cancer Dataset Example
This example trains a Random Forest model on the breast cancer dataset and then runs privacy attacks against the saved model.
Training the Model:
Training Random Forest on Cancer Dataset
"""Example training a Random Forest classifier on breast cancer data.

This simple example demonstrates how the model and data can be passed to
the Target wrapper, which creates a directory with all saved information.
"""

import logging

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sacroml.attacks.target import Target

output_dir = "target_rf_breast_cancer"


def main() -> None:
    """Train the classifier and save it, with its data, as a Target.

    The saved directory (``output_dir``) is what the attack example loads.
    """
    # The root logger defaults to WARNING, so the progress messages below
    # would be dropped unless logging has already been configured by an
    # imported library. This call is a no-op in that case.
    logging.basicConfig(level=logging.INFO)

    logging.info("Loading dataset")
    X, y = load_breast_cancer(return_X_y=True, as_frame=False)

    logging.info("Splitting data into training and test sets")
    # NOTE(review): no random_state is given, so the split (and therefore
    # the trained model) differs from run to run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    logging.info("Defining the model")
    model = RandomForestClassifier(min_samples_split=2, min_samples_leaf=1)

    logging.info("Training the model")
    model.fit(X_train, y_train)

    logging.info("Wrapping the model and data in a Target object")
    # The arrays are handed to the wrapper directly; no dataset module is
    # needed for this example.
    target = Target(
        model=model,
        dataset_name="breast cancer",
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )

    logging.info("Writing Target object to directory: '%s'", output_dir)
    target.save(output_dir)


if __name__ == "__main__":
    main()
Running Privacy Attacks:
Running Privacy Attacks on Cancer Model
"""Example of how to run attacks on a model saved with the Target wrapper."""

import logging

from sacroml.attacks.likelihood_attack import LIRAAttack
from sacroml.attacks.structural_attack import StructuralAttack
from sacroml.attacks.target import Target
from sacroml.attacks.worst_case_attack import WorstCaseAttack

output_dir = "output_rf_breast_cancer"
target_dir = "target_rf_breast_cancer"


def main() -> None:
    """Load the saved Target and run each attack against it in turn.

    Every attack is given the same ``output_dir``, which is where the final
    log message tells the user to look for the report.
    """
    # The root logger defaults to WARNING, so the progress messages below
    # would be dropped unless logging has already been configured by an
    # imported library. This call is a no-op in that case.
    logging.basicConfig(level=logging.INFO)

    logging.info("Loading Target object from '%s'", target_dir)
    # target_dir must already exist; it is the directory written by the
    # training example.
    target = Target()
    target.load(target_dir)

    logging.info("Running LiRA attack")
    lira = LIRAAttack(n_shadow_models=100, output_dir=output_dir)
    lira.attack(target)

    logging.info("Running worst case attack")
    worst_case = WorstCaseAttack(
        n_reps=10,
        n_dummy_reps=1,
        train_beta=5,
        test_beta=2,
        p_thresh=0.05,
        test_prop=0.5,
        output_dir=output_dir,
    )
    worst_case.attack(target)

    logging.info("Running structural attack")
    structural = StructuralAttack(output_dir=output_dir)
    structural.attack(target)

    logging.info("Report available in directory: '%s'", output_dir)


if __name__ == "__main__":
    main()
Nursery Dataset Example
This example trains a Random Forest model on the nursery dataset and then assesses its privacy risks.
Training the Model:
Training Random Forest on Nursery Dataset
"""Example training a Random Forest classifier on the OpenML nursery dataset.

This example demonstrates how a dataset module can be supplied to the Target
wrapper along with the train and test indices. This is in contrast to the
breast cancer example where numpy arrays are passed directly.

This example also shows how to add feature encoding information to the Target
object. This is only necessary for attribute inference attacks.

A directory is created with the saved model and dataset code, which can then
be used to run attacks.
"""

import logging

from dataset import Nursery
from sklearn.ensemble import RandomForestClassifier

from sacroml.attacks.target import Target

output_dir = "target_rf_nursery"


def main() -> None:
    """Train the classifier and save it, with its dataset module, as a Target.

    The saved directory (``output_dir``) is what the attack example loads.
    """
    # The root logger defaults to WARNING, so the progress messages below
    # would be dropped unless logging has already been configured by an
    # imported library. This call is a no-op in that case.
    logging.basicConfig(level=logging.INFO)

    logging.info("Loading dataset")
    handler = Nursery()

    logging.info("Splitting data into training and test sets")
    indices_train, indices_test = handler.get_train_test_indices()

    logging.info("Getting data")
    X, y = handler.get_data()
    X_train, y_train = handler.get_subset(X, y, indices_train)
    X_test, y_test = handler.get_subset(X, y, indices_test)

    logging.info("Defining the model")
    model = RandomForestClassifier(bootstrap=False)

    logging.info("Training the model")
    model.fit(X_train, y_train)
    acc_train = model.score(X_train, y_train)
    acc_test = model.score(X_test, y_test)
    logging.info("Base model train accuracy: %.4f", acc_train)
    logging.info("Base model test accuracy: %.4f", acc_test)

    logging.info("Wrapping the model and data in a Target object")
    # Only the indices and the dataset module are passed here, not the
    # arrays themselves.
    # NOTE(review): "dataset.py" is a relative path, so this presumably has
    # to be run from the directory containing that file - confirm.
    target = Target(
        model=model,
        dataset_name="Nursery",  # Must match the class name in dataset module
        dataset_module_path="dataset.py",
        indices_train=indices_train,
        indices_test=indices_test,
    )

    logging.info("Wrapping feature details and encoding for attribute inference")
    # Pair each original feature name with the one-hot columns it occupies.
    for name, indices in zip(handler.feature_names, handler.feature_indices):
        target.add_feature(
            name=name,
            indices=indices,
            encoding="onehot",
        )

    logging.info("Writing Target object to directory: '%s'", output_dir)
    target.save(output_dir)


if __name__ == "__main__":
    main()
Running Privacy Attacks:
Running Privacy Attacks on Nursery Model
"""Example of how to run attacks on a model saved with the Target wrapper."""

import logging

from sacroml.attacks.attribute_attack import AttributeAttack
from sacroml.attacks.likelihood_attack import LIRAAttack
from sacroml.attacks.structural_attack import StructuralAttack
from sacroml.attacks.target import Target
from sacroml.attacks.worst_case_attack import WorstCaseAttack

output_dir = "output_rf_nursery"
target_dir = "target_rf_nursery"


def main() -> None:
    """Load the saved Target and run each attack against it in turn.

    Every attack is given the same ``output_dir``, which is where the final
    log message tells the user to look for the report.
    """
    # The root logger defaults to WARNING, so the progress messages below
    # would be dropped unless logging has already been configured by an
    # imported library. This call is a no-op in that case.
    logging.basicConfig(level=logging.INFO)

    logging.info("Loading Target object from '%s'", target_dir)
    # target_dir must already exist; it is the directory written by the
    # training example.
    target = Target()
    target.load(target_dir)

    logging.info("Running LiRA attack")
    lira = LIRAAttack(n_shadow_models=100, output_dir=output_dir)
    lira.attack(target)

    logging.info("Running worst case attack")
    worst_case = WorstCaseAttack(
        n_reps=10,
        n_dummy_reps=1,
        train_beta=5,
        test_beta=2,
        p_thresh=0.05,
        test_prop=0.5,
        output_dir=output_dir,
    )
    worst_case.attack(target)

    logging.info("Running structural attack")
    structural = StructuralAttack(output_dir=output_dir)
    structural.attack(target)

    logging.info("Running attribute attack")
    # NOTE(review): n_cpu=8 is hard-coded; adjust it to the machine in use.
    attribute = AttributeAttack(n_cpu=8, output_dir=output_dir)
    attribute.attack(target)

    logging.info("Report available in directory: '%s'", output_dir)


if __name__ == "__main__":
    main()
Dataset Processing:
Nursery Dataset Processing (the dataset.py module imported by the nursery training script)
"""Example dataset handler for the OpenML nursery dataset.

Scikit-learn datasets must implement `sacroml.attacks.data.SklearnDataHandler`.
"""

from collections.abc import Sequence

import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sacroml.attacks.data import SklearnDataHandler

random_state = 1


class Nursery(SklearnDataHandler):
    """Nursery dataset handler.

    Downloads the dataset on construction, keeps the raw string arrays, and
    exposes one-hot encoded features and label-encoded targets.
    """

    def __init__(self) -> None:
        """Fetch and process the nursery dataset."""
        # Get original dataset (OpenML data id 26); this needs network
        # access unless scikit-learn already has the data cached.
        nursery_data = fetch_openml(data_id=26, as_frame=True)
        self.X_orig = np.asarray(nursery_data.data, dtype=str)
        self.y_orig = np.asarray(nursery_data.target, dtype=str)

        # Process dataset
        self.label_enc = LabelEncoder()
        self.feature_enc = OneHotEncoder()
        self.X = self.feature_enc.fit_transform(self.X_orig).toarray()
        self.y = self.label_enc.fit_transform(self.y_orig)

        # Feature encoding information (only required for attribute inference)
        #
        # The encoder emits one column per category, feature by feature, so
        # the column group of each original feature follows from the fitted
        # categories instead of being hard-coded. For the nursery data this
        # is expected to give:
        #   [0, 1, 2]          parents
        #   [3, 4, 5, 6, 7]    has_nurs
        #   [8, 9, 10, 11]     form
        #   [12, 13, 14, 15]   children
        #   [16, 17, 18]       housing
        #   [19, 20]           finance
        #   [21, 22, 23]       social
        #   [24, 25, 26]       health
        self.feature_indices: list[list[int]] = []
        start = 0
        for categories in self.feature_enc.categories_:
            stop = start + len(categories)
            self.feature_indices.append(list(range(start, stop)))
            start = stop
        self.feature_names = nursery_data.feature_names

    def __len__(self) -> int:
        """Return the length of the dataset."""
        return len(self.X)

    def get_raw_data(self) -> tuple[np.ndarray, np.ndarray] | None:
        """Return the original raw data arrays."""
        return self.X_orig, self.y_orig

    def get_data(self) -> tuple[np.ndarray, np.ndarray]:
        """Return the processed data arrays."""
        return self.X, self.y

    def get_subset(
        self, X: np.ndarray, y: np.ndarray, indices: Sequence[int]
    ) -> tuple[np.ndarray, np.ndarray]:
        """Return the rows of `X` and `y` selected by `indices`."""
        return X[indices], y[indices]

    def get_train_test_indices(self) -> tuple[Sequence[int], Sequence[int]]:
        """Return train and test set indices.

        The split is 50/50, stratified on the encoded labels, and uses the
        module-level `random_state` so that it is the same on every call.
        """
        indices = range(len(self))
        train, test = train_test_split(
            indices, test_size=0.5, stratify=self.y, random_state=random_state
        )
        return train, test