User Stories#

This section contains real-world user stories demonstrating how SACRO-ML is used in Trusted Research Environments (TREs).

User Story 1: Basic Model Training and Attack#

A basic workflow showing model training and privacy assessment.
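
At its core, the researcher's side of this story is four calls: train a safemodel classifier, run a preliminary disclosure check, wrap the model and data in a Target, then attack and release it. A condensed sketch using the same classes and calls as the full template below (with the data preparation elided, so X_train, y_train, X_test and y_test are assumed to exist) might look like:

# Condensed sketch of the User Story 1 researcher workflow.
# Assumes X_train, y_train, X_test, y_test have already been prepared.
from sacroml.attacks.target import Target
from sacroml.safemodel.classifiers import SafeDecisionTreeClassifier

model = SafeDecisionTreeClassifier(random_state=1)
model.fit(X_train, y_train)
model.preliminary_check()  # quick check that the hyperparameters are not disclosive

target = Target(
    model=model,
    dataset_name="nursery",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
model.run_attack(target, "worst_case", "training_artefacts")  # self-assessment
model.request_release(path="training_artefacts", ext="pkl", target=target)

The full template below adds the data reading, encoding, logging, and feature registration around these calls.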

Researcher Template:

User Story 1 - Researcher Template#
  1"""RESEARCHER EXAMPLE FOR USER STORY 1.
  2
  3This file is an example of a researcher creating/training a machine learning
  4model and requesting for it to be released.
  5
  6This specific example uses the nursery dataset: data is read in and
  7pre-processed, and a classifier is trained and tested on this dataset.
  8
  9This example follows User Story 1.
 10
 11Steps:
 12
 13- Researcher reads in data and processes it.
 14- Researcher creates and trains a classifier.
 15- Researcher runs experiments themselves to check if their model is disclosive
 16  or not.
 17- Once satisfied, researcher calls request_release() to make it ready for TRE
 18"""
 19
 20import logging
 21import os
 22
 23import numpy as np
 24import pandas as pd
 25from sklearn.model_selection import train_test_split
 26from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 27
 28from sacroml.attacks.target import Target
 29from sacroml.safemodel.classifiers import SafeDecisionTreeClassifier
 30
 31
 32def main():
 33    """Create and train a model to be released."""
 34    # This section is not necessary but helpful - cleans up files that are
 35    # created by sacroml
 36    save_directory = "training_artefacts"
 37    print("Creating directory for training artefacts")
 38
 39    if not os.path.exists(save_directory):
 40        os.makedirs(save_directory)
 41
 42    print()
 43    print("Acting as researcher...")
 44    print()
 45
 46    # Read in and pre-process the dataset - replace this with your data
 47    # reading/pre-processing code
 48    print(os.getcwd())
 49    filename = os.path.join(".", "user_stories_resources", "dataset_26_nursery.csv")
 50    print("Reading data from " + filename)
 51    data_df = pd.read_csv(filename)
 52
 53    print()
 54
 55    labels = np.asarray(data_df["class"])
 56    data = np.asarray(data_df.drop(columns=["class"], inplace=False))
 57
 58    n_features = np.shape(data)[1]
 59    indices: list[list[int]] = [
 60        [0, 1, 2],  # parents
 61        [3, 4, 5, 6, 7],  # has_nurs
 62        [8, 9, 10, 11],  # form
 63        [12, 13, 14, 15],  # children
 64        [16, 17, 18],  # housing
 65        [19, 20],  # finance
 66        [21, 22, 23],  # social
 67        [24, 25, 26],  # health
 68    ]
 69
 70    (
 71        X_train_orig,
 72        X_test_orig,
 73        y_train_orig,
 74        y_test_orig,
 75    ) = train_test_split(
 76        data,
 77        labels,
 78        test_size=0.5,
 79        stratify=labels,
 80        shuffle=True,
 81    )
 82
 83    label_enc = LabelEncoder()
 84    feature_enc = OneHotEncoder()
 85    X_train = feature_enc.fit_transform(X_train_orig).toarray()
 86    y_train = label_enc.fit_transform(y_train_orig)
 87    X_test = feature_enc.transform(X_test_orig).toarray()
 88    y_test = label_enc.transform(y_test_orig)
 89
 90    logging.getLogger("attack-reps").setLevel(logging.WARNING)
 91    logging.getLogger("prep-attack-data").setLevel(logging.WARNING)
 92    logging.getLogger("attack-from-preds").setLevel(logging.WARNING)
 93
 94    # Create and train a SafeDecisionTree classifier on the above data
 95    model = SafeDecisionTreeClassifier(random_state=1)
 96    model.fit(X_train, y_train)
 97
 98    # Run a preliminary check to make sure the model is not disclosive
 99    _, _ = model.preliminary_check()
100
101    # Wrap the model and data in a Target object
102    # needed in order to call request_release()
103    target = Target(
104        model=model,
105        dataset_name="nursery",
106        # processed data
107        X_train=X_train,
108        y_train=y_train,
109        X_test=X_test,
110        y_test=y_test,
111        # original unprocessed data
112        X_train_orig=X_train_orig,
113        y_train_orig=y_train_orig,
114        X_test_orig=X_test_orig,
115        y_test_orig=y_test_orig,
116    )
117    for i in range(n_features):
118        target.add_feature(data_df.columns[i], indices[i], "onehot")
119
120    logging.info("Dataset: %s", target.dataset_name)
121    logging.info("Features: %s", target.features)
122    logging.info("X_train shape: %s", str(target.X_train.shape))
123    logging.info("y_train shape: %s", str(target.y_train.shape))
124    logging.info("X_test shape: %s", str(target.X_test.shape))
125    logging.info("y_test shape: %s", str(target.y_test.shape))
126
127    # Researcher can check for themselves whether their model passes individual
128    # disclosure checks.Leave this code as-is for output disclosure checking.
129    print("==========> first running attacks explicitly via run_attack()")
130    for attack_name in ["worst_case", "attribute", "lira"]:
131        print(f"===> running {attack_name} attack directly")
132        metadata = model.run_attack(target, attack_name, save_directory)
133        logging.info("metadata is:")
134        for key, val in metadata.items():
135            if isinstance(val, dict):
136                logging.info(" %s ", key)
137                for key1, val2 in val.items():
138                    logging.info("  %s : %s", key1, val2)
139            else:
140                logging.info(" %s : %s", key, val)
141
142    # Modify/re-run all of the above code until you're happy with the model
143    # you've created. If the tests do not pass, try changing the model or
144    # hyperparameters until the tests pass. When you are satisfied and ready to
145    # release your model, call the request release() function with the Target
146    # class you created above.
147
148    # This code will run checks for the TRE staff.
149
150    # NOTE: you should only do this when you have confirmed that the above
151    # tests pass. You would not normally waste your and TRE time calling this
152    # unless you have already checked that your model is not disclosive or can
153    # provide a justification for an exception request.
154
155    print("===> now running attacks implicitly via request_release()")
156    model.request_release(path=save_directory, ext="pkl", target=target)
157
158    # The files generated can be found in this file location.
159    print(f"Please see the files generated in: {save_directory}")
160
161
162if __name__ == "__main__":
163    main()

TRE Implementation:

User Story 1 - TRE Implementation#
 1"""TRE SCRIPT FOR USER STORY 1.
 2
 3This file contains the code needed to run user story 1.
 4
 5To run: change the user_story key inside the .yaml config file to '1', and run
 6the 'generate_disclosure_risk_report.py' file.
 7
 8NOTE: you should not need to change this file at all, set all parameters via
 9the .yaml file.
10"""
11
12import argparse
13import os
14
15import yaml
16
17from sacroml.attacks.attack_report_formatter import GenerateTextReport
18
19
20def generate_report(directory, attack_results, target, outfile):
21    """Generate report based on target model."""
22    print()
23    print("Acting as TRE...")
24    print()
25
26    text_report = GenerateTextReport()
27
28    attack_pathname = os.path.join(directory, attack_results)
29    text_report.process_attack_target_json(
30        attack_pathname, target_filename=os.path.join(directory, target)
31    )
32
33    out_pathname = os.path.join(directory, outfile)
34    text_report.export_to_file(output_filename=out_pathname, move_files=True)
35
36    print("Results written to " + out_pathname)
37
38
39def run_user_story(release_config: dict):
40    """Run the user story, parsing arguments and then invoking report generation."""
41    generate_report(
42        release_config["training_artefacts_dir"],
43        release_config["attack_results"],
44        release_config["target_results"],
45        release_config["outfile"],
46    )
47
48
49if __name__ == "__main__":
50    parser = argparse.ArgumentParser(
51        description=(
52            "Generate a risk report after request_release() "
53            "has been called by researcher"
54        )
55    )
56
57    parser.add_argument(
58        "--config_file",
59        type=str,
60        action="store",
61        dest="config_file",
62        required=False,
63        default="default_config.yaml",
64        help=("Name of yaml configuration file"),
65    )
66
67    args = parser.parse_args()
68
69    try:
70        with open(args.config_file, encoding="utf-8") as handle:
71            config = yaml.load(handle, Loader=yaml.loader.SafeLoader)
72    except AttributeError as error:
73        print(
74            f"Invalid command. Try --help to get more detailserror message is {error}"
75        )
76
77    run_user_story(config)
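
The script is normally driven by the YAML config file (see Configuration and Utilities below), but run_user_story() only needs a dict with four keys, so it can also be exercised directly. For example, with values mirroring the defaults in default_config.yaml:

# Direct invocation with the same keys the YAML config supplies.
run_user_story(
    {
        "training_artefacts_dir": "training_artefacts",
        "attack_results": "report.json",
        "target_results": "target.yaml",
        "outfile": "summary.txt",
    }
)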

User Story 2: Data Processing and Privacy Assessment#

An advanced workflow in which the researcher supplies a reusable data pre-processing function, so the TRE can rebuild the train/test split and run a comprehensive privacy assessment itself.

Data Processing:

User Story 2 - Data Processing#
 1"""SUPPORTING FILE FOR USER STORY 2.
 2
 3This file is an example of a function created by a researcher that will
 4pre-process a dataset.
 5
 6To use: write a function that will process your input data, and output the
 7processed version.
 8
 9NOTE: in order to work, this function needs to:
10
11- take a single parameter (the data to be processed)
12- return a dictionary
13- which contains the keys ]
14    ['n_features_raw_data', 'X_transformed', 'y_transformed', 'train_indices']
15"""
16
17import numpy as np
18from sklearn.model_selection import train_test_split
19from sklearn.preprocessing import LabelEncoder, OneHotEncoder
20
21
22def process_dataset(data):
23    """Create a function that does the data pre-processing for user story 2."""
24    # Replace the contents of this function with your pre-processing code
25
26    labels = np.asarray(data["class"])
27    data = np.asarray(data.drop(columns=["class"], inplace=False))
28
29    n_features_raw_data = np.shape(data)[1]
30
31    label_enc = LabelEncoder()
32    feature_enc = OneHotEncoder()
33    X_transformed = feature_enc.fit_transform(data).toarray()
34    y_transformed = label_enc.fit_transform(labels)
35
36    row_indices = np.arange(np.shape(X_transformed)[0])
37
38    # This step is not necessary, however it's the simplest way of getting
39    # training indices from the data. Any method of generating indices of
40    # samples to be used for training will work here.
41    (
42        X_train,
43        X_test,
44        y_train,
45        y_test,
46        train_indices,
47        test_indices,
48    ) = train_test_split(
49        X_transformed,
50        y_transformed,
51        row_indices,
52        test_size=0.5,
53        stratify=y_transformed,
54        shuffle=True,
55    )
56
57    returned = {}
58    returned["n_features_raw_data"] = n_features_raw_data
59    returned["X_transformed"] = X_transformed
60    returned["y_transformed"] = y_transformed
61    returned["train_indices"] = train_indices
62
63    return returned
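
Since the TRE-side wrapper simply loads and calls this function, it is worth checking the contract before hand-off. A minimal self-test (hypothetical, using the same dataset path as the other templates) might be:

# Quick self-check that process_dataset() honours the required contract.
import pandas as pd

data = pd.read_csv("./user_stories_resources/dataset_26_nursery.csv")
returned = process_dataset(data)
required = {"n_features_raw_data", "X_transformed", "y_transformed", "train_indices"}
missing = required - set(returned)
assert not missing, f"process_dataset() is missing keys: {missing}"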

Researcher Template:

User Story 2 - Researcher Template#
  1"""RESEARCHER EXAMPLE FOR USER STORY 2.
  2
  3This file is an example of a researcher creating/training a machine learning
  4model and to be released form a secure environment.
  5
  6This specific example uses the nursery dataset: data is read in and
  7pre-processed, and a classifier is trained and tested on this dataset.
  8
  9This example follows User Story 2.
 10
 11Steps:
 12
 13- Researcher creates a function to read and process a dataset, which a TRE can
 14  also use and call.
 15- Researcher creates and trains a classifier on this data.
 16- Researcher emails (or otherwise contacts) TRE to request the model be released.
 17- TREs will use this code/functions to test the model themselves.
 18"""
 19
 20import logging
 21import os
 22
 23import numpy as np
 24import pandas as pd
 25from data_processing_researcher import process_dataset
 26
 27from sacroml.attacks.target import Target
 28from sacroml.safemodel.classifiers import SafeDecisionTreeClassifier
 29
 30
 31def run_user_story():
 32    """Create and train a model to be released."""
 33    # This section is not necessary but helpful - cleans up files that are
 34    # created by sacroml
 35    directory = "training_artefacts"
 36    print("Creating directory for training artefacts")
 37
 38    if not os.path.exists(directory):
 39        os.makedirs(directory)
 40
 41    print()
 42    print("Acting as researcher...")
 43    print()
 44
 45    # Read in and pre-process the dataset - replace this with your dataset
 46    filename = os.path.join(".", "user_stories_resources", "dataset_26_nursery.csv")
 47    print("Reading data from " + filename)
 48    data = pd.read_csv(filename)
 49
 50    # Write a function to pre-process the data that the TRE can call (see
 51    # data_processing_researcher.py) Use the output of this function to split
 52    # the data into training/testing sets.
 53
 54    # NOTE: to use this user story/script, the process_dataset function MUST:
 55    # take a single parameter (the data to be processed) return a dictionary
 56    # which contains the keys:
 57    # >>> ['n_features_raw_data', 'X_transformed', 'y_transformed', 'train_indices']
 58    # as in this example.
 59
 60    returned = process_dataset(data)
 61
 62    X_transformed = returned["X_transformed"]
 63    y_transformed = returned["y_transformed"]
 64
 65    train_indices = set(returned["train_indices"])
 66
 67    X_train = []
 68    X_test = []
 69    y_train = []
 70    y_test = []
 71
 72    for i, label in enumerate(y_transformed):
 73        if i in train_indices:
 74            X_train.append(X_transformed[i])
 75            y_train.append(label)
 76        else:
 77            X_test.append(X_transformed[i])
 78            y_test.append(label)
 79
 80    logging.getLogger("attack-reps").setLevel(logging.WARNING)
 81    logging.getLogger("prep-attack-data").setLevel(logging.WARNING)
 82    logging.getLogger("attack-from-preds").setLevel(logging.WARNING)
 83
 84    # Build a model and request its release
 85    model = SafeDecisionTreeClassifier(random_state=1)
 86    model.fit(X_train, y_train)
 87    model.request_release(path=directory, ext="pkl")
 88
 89    # Wrap the model and data in a Target object
 90    target = Target(
 91        model=model,
 92        dataset_name="nursery",
 93        X_train=X_train,
 94        y_train=y_train,
 95        X_test=X_test,
 96        y_test=y_test,
 97    )
 98
 99    # NOTE: we assume here that the researcher does not use the target.save()
100    # function and instead provides only the model and the list of indices
101    # which have been used to split the dataset, which will allow a TRE to
102    # re-create the input data used in training.
103
104    logging.info("Dataset: %s", target.name)
105    logging.info("Features: %s", target.features)
106    logging.info("X_train shape = %s", np.shape(target.X_train))
107    logging.info("y_train shape = %s", np.shape(target.y_train))
108    logging.info("X_test shape = %s", np.shape(target.X_test))
109    logging.info("y_test shape = %s", np.shape(target.y_test))
110
111
112if __name__ == "__main__":
113    run_user_story()
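
The NOTE in the template says the researcher hands over the model plus the list of training indices, but it does not show how the indices are saved. One minimal possibility (hypothetical, not part of the released example, and reusing the template's directory and train_indices variables) is to write them alongside the other artefacts:

# Hypothetical: persist the training indices so the TRE can rebuild the split.
np.savetxt(
    os.path.join(directory, "train_indices.txt"),
    np.asarray(sorted(train_indices)),
    fmt="%d",
)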

TRE Implementation:

User Story 2 - TRE Implementation#
  1"""TRE SCRIPT FOR USER STORY 2.
  2
  3This file contains the code needed to run user story 2.
  4
  5To run: change the user_story key inside the .yaml config file to '2', and run
  6the 'generate_disclosure_risk_report.py' file.
  7
  8NOTE: you should not need to change this file at all, set all parameters via
  9the .yaml file.
 10"""
 11
 12import argparse
 13import importlib
 14import os
 15import pickle
 16
 17import numpy as np
 18import pandas as pd
 19import yaml
 20
 21from sacroml.attacks.attack_report_formatter import GenerateTextReport
 22from sacroml.attacks.target import Target
 23
 24
 25def process_dataset(filename, function_name, data_to_be_processed):
 26    """Process dataset.
 27
 28    DO NOT CHANGE: this is a wrapper function that allows a callable function
 29    to be read from a file.
 30    """
 31    spec = importlib.util.spec_from_file_location(function_name, filename)
 32    module = importlib.util.module_from_spec(spec)
 33    spec.loader.exec_module(module)
 34    function = getattr(module, function_name)
 35    return function(data_to_be_processed)
 36
 37
 38def generate_report(
 39    data_processing_filename,
 40    data_processing_function_name,
 41    dataset_filename,
 42    directory,
 43    target_model,
 44    attack_results,
 45    target_filename,
 46    outfile,
 47):
 48    """Generate report based on target model."""
 49    print()
 50    print("Acting as TRE...")
 51    print(
 52        "(when instructions on how to recreate the dataset have "
 53        "been provided by the researcher)"
 54    )
 55    print(directory)
 56    print()
 57
 58    # Read in the model supplied by the researcher
 59    filename = os.path.join(directory, target_model)
 60    print("Reading target model from " + filename)
 61    with open(filename, "rb") as f:
 62        target_model = pickle.load(f)
 63
 64    # Read the data used by the researcher, and process it using their defined function
 65    print("Reading data from " + dataset_filename)
 66    data = pd.read_csv(dataset_filename)
 67
 68    returned = process_dataset(
 69        data_processing_filename, data_processing_function_name, data
 70    )
 71    X_transformed = returned["X_transformed"]
 72    y_transformed = returned["y_transformed"]
 73    train_indices = set(returned["train_indices"])
 74
 75    X_train = []
 76    X_test = []
 77    y_train = []
 78    y_test = []
 79
 80    for i, label in enumerate(y_transformed):
 81        if i in train_indices:
 82            X_train.append(X_transformed[i])
 83            y_train.append(label)
 84        else:
 85            X_test.append(X_transformed[i])
 86            y_test.append(label)
 87
 88    X_train = np.array(X_train)
 89    y_train = np.array(y_train)
 90    X_test = np.array(X_test)
 91    y_test = np.array(y_test)
 92
 93    # Wrap the model and data in a Target object
 94    target = Target(
 95        model=target_model,
 96        X_train=X_train,
 97        y_train=y_train,
 98        X_test=X_test,
 99        y_test=y_test,
100    )
101
102    # TRE calls request_release()
103    print("===> now running attacks implicitly via request_release()")
104    target_model.request_release(path=directory, ext="pkl", target=target)
105
106    print(f"Please see the files generated in: {directory}")
107
108    # Generate a report indicating calculated disclosure risk
109    text_report = GenerateTextReport()
110    text_report.process_attack_target_json(
111        os.path.join(directory, attack_results),
112        target_filename=os.path.join(directory, target_filename),
113    )
114
115    text_report.export_to_file(
116        output_filename=os.path.join(directory, outfile), move_files=True
117    )
118
119    print("Results written to " + str(os.path.join(directory, outfile)))
120
121
122def run_user_story(release_config: dict):
123    """Run the user story, parsing arguments and then invoking report generation."""
124    generate_report(
125        release_config["data_processing_filename"],
126        release_config["data_processing_function_name"],
127        release_config["dataset_filename"],
128        release_config["training_artefacts_dir"],
129        release_config["target_model"],
130        release_config["attack_results"],
131        release_config["target_results"],
132        release_config["outfile"],
133    )
134
135
136if __name__ == "__main__":
137    parser = argparse.ArgumentParser(
138        description=(
139            "Generate a risk report after request_release() "
140            "has been called by researcher"
141        )
142    )
143
144    parser.add_argument(
145        "--config_file",
146        type=str,
147        action="store",
148        dest="config_file",
149        required=False,
150        default="default_config.yaml",
151        help=("Name of yaml configuration file"),
152    )
153
154    args = parser.parse_args()
155
156    try:
157        with open(args.config_file, encoding="utf-8") as handle:
158            config = yaml.load(handle, Loader=yaml.loader.SafeLoader)
159    except AttributeError as error:
160        print(
161            f"Invalid command. Try --help to get more detailserror message is {error}"
162        )
163
164    run_user_story(config)

User Story 3: Advanced Privacy Analysis#

A workflow for comprehensive privacy analysis of sensitive datasets: the researcher saves the model and data manually, and the TRE runs the attacks (worst-case and LiRA) directly.

Researcher Template:

User Story 3 - Researcher Template#
 1"""RESEARCHER EXAMPLE FOR USER STORY 3.
 2
 3This file is an example of a researcher creating/training a machine learning
 4model and to be released form a secure environment.
 5
 6This specific example uses the nursery dataset: data is read in and
 7pre-processed, and a classifier is trained and tested on this dataset.
 8
 9This example follows User Story 3.
10
11Steps:
12
13- Researcher creates and pre-processes a dataset.
14- Researcher creates and trains a classifier on this data.
15- Reasercher saves the model manually (e.g. using pickle, not through
16  request_release() or similar).
17- Researcher emails (or otherwise contacts) TRE to request the model be released.
18- TREs will use this model and data to test the model themselves.
19"""
20
21import os
22import pickle
23
24import numpy as np
25import pandas as pd
26from sklearn.ensemble import RandomForestClassifier
27from sklearn.metrics import accuracy_score
28from sklearn.model_selection import train_test_split
29from sklearn.preprocessing import LabelEncoder, OneHotEncoder
30
31
32def run_user_story():
33    """Create and train a model to be released."""
34    # This section is not necessary but helpful - cleans up files that are
35    # created by sacroml
36    directory = "training_artefacts"
37    print("Creating directory for training artefacts")
38
39    if not os.path.exists(directory):
40        os.makedirs(directory)
41
42    # Read in and pre-process the dataset - replace this with your data
43    # reading/pre-processing code.
44    filename = os.path.join(".", "user_stories_resources", "dataset_26_nursery.csv")
45    print("Reading data from " + filename)
46    data = pd.read_csv(filename)
47
48    target_encoder = LabelEncoder()
49    target_vals = target_encoder.fit_transform(data["class"].values)
50    target_dataframe = pd.DataFrame({"class": target_vals})
51    data = data.drop(columns=["class"], inplace=False)
52
53    feature_encoder = OneHotEncoder()
54    X_encoded = feature_encoder.fit_transform(data).toarray()
55    feature_dataframe = pd.DataFrame(
56        X_encoded, columns=feature_encoder.get_feature_names_out()
57    )
58
59    X_train, X_test, y_train, y_test = train_test_split(
60        feature_dataframe.values,
61        target_dataframe.to_numpy().flatten(),
62        test_size=0.7,
63        random_state=42,
64    )
65
66    # Save the training and test data to a file which a TRE can access
67    print("Saving training/testing data to ./" + directory)
68    np.savetxt(os.path.join(directory, "X_train.txt"), X_train, fmt="%d")
69    np.savetxt(os.path.join(directory, "y_train.txt"), y_train, fmt="%d")
70    np.savetxt(os.path.join(directory, "X_test.txt"), X_test, fmt="%d")
71    np.savetxt(os.path.join(directory, "y_test.txt"), y_test, fmt="%d")
72
73    # Create, train and test a model
74    # Replace this with your training and testing code
75    hyperparameters = {}
76    hyperparameters["min_samples_split"] = 5
77    hyperparameters["min_samples_leaf"] = 5
78    hyperparameters["max_depth"] = None
79    hyperparameters["bootstrap"] = False
80
81    target_model = RandomForestClassifier(**hyperparameters)
82    target_model.fit(X_train, y_train)
83
84    train_acc = accuracy_score(y_train, target_model.predict(X_train))
85    test_acc = accuracy_score(y_test, target_model.predict(X_test))
86    print(f"Training accuracy on model: {train_acc:.2f}")
87    print(f"Testing accuracy on model: {test_acc:.2f}")
88
89    # Save your model somewhere a TRE can access
90    filename = os.path.join(directory, "model.pkl")
91    print("Saving model to " + filename)
92    with open(filename, "wb") as file:
93        pickle.dump(target_model, file)
94
95
96if __name__ == "__main__":
97    run_user_story()

TRE Implementation:

User Story 3 - TRE Implementation#
  1"""TRE SCRIPT FOR USER STORY 3.
  2
  3This file contains the code needed to run user story 3.
  4
  5To run: change the user_story key inside the .yaml config file to '3', and run
  6the 'generate_disclosure_risk_report.py' file.
  7
  8NOTE: you should not need to change this file at all, set all parameters via
  9the .yaml file.
 10"""
 11
 12import argparse
 13import logging
 14import os
 15import pickle
 16
 17import numpy as np
 18import yaml
 19
 20from sacroml.attacks.attack_report_formatter import GenerateTextReport
 21from sacroml.attacks.likelihood_attack import LIRAAttack
 22from sacroml.attacks.target import Target
 23from sacroml.attacks.worst_case_attack import WorstCaseAttack
 24
 25
 26def generate_report(
 27    directory,
 28    target_model,
 29    X_train,
 30    y_train,
 31    X_test,
 32    y_test,
 33    target_filename,
 34    outfile,
 35):
 36    """Generate report based on target model."""
 37    print()
 38    print("Acting as TRE...")
 39    print()
 40
 41    if not os.path.exists(directory):
 42        os.makedirs(directory)
 43
 44    # Suppress messages from SACRO-ML -- comment out these lines to
 45    # see all the sacroml logging statements
 46    logging.getLogger("attack-reps").setLevel(logging.WARNING)
 47    logging.getLogger("prep-attack-data").setLevel(logging.WARNING)
 48    logging.getLogger("attack-from-preds").setLevel(logging.WARNING)
 49
 50    # Read the model to be released as supplied by the researcher
 51    model_filename = os.path.join(directory, target_model)
 52    print("Reading target model from " + model_filename)
 53    with open(model_filename, "rb") as file:
 54        target_model = pickle.load(file)
 55
 56    # Read the training/testing data as supplied by the researcher
 57    print("Reading training/testing data from ./" + directory)
 58    train_x = np.loadtxt(os.path.join(directory, X_train))
 59    train_y = np.loadtxt(os.path.join(directory, y_train))
 60    test_x = np.loadtxt(os.path.join(directory, X_test))
 61    test_y = np.loadtxt(os.path.join(directory, y_test))
 62
 63    # Wrap the training and test data into the Target object
 64    target = Target(
 65        model=target_model,
 66        X_train=train_x,
 67        y_train=train_y,
 68        X_test=test_x,
 69        y_test=test_y,
 70    )
 71    target.save(os.path.join(directory, "target"))
 72
 73    # Run the attack
 74    wca = WorstCaseAttack(n_dummy_reps=10, output_dir=directory)
 75    wca.attack(target)
 76
 77    # Run the LiRA attack to test disclosure risk
 78    lira_attack_obj = LIRAAttack(n_shadow_models=100, output_dir=directory)
 79    lira_attack_obj.attack(target)
 80
 81    text_report = GenerateTextReport()
 82    text_report.process_attack_target_json(
 83        os.path.join(directory, "report") + ".json",
 84        target_filename=os.path.join(directory, "target", target_filename),
 85    )
 86
 87    text_report.export_to_file(
 88        output_filename=os.path.join(directory, outfile),
 89        move_files=True,
 90        model_filename=model_filename,
 91    )
 92
 93    print("Results written to " + os.path.join(directory, outfile))
 94
 95
 96def run_user_story(release_config: dict):
 97    """Run the user story, parsing arguments and then invoking report generation."""
 98    generate_report(
 99        release_config["training_artefacts_dir"],
100        release_config["target_model"],
101        release_config["X_train_path"],
102        release_config["y_train_path"],
103        release_config["X_test_path"],
104        release_config["y_test_path"],
105        release_config["target_results"],
106        release_config["outfile"],
107    )
108
109
110if __name__ == "__main__":
111    parser = argparse.ArgumentParser(
112        description=(
113            "Generate a risk report after request_release() "
114            "has been called by researcher"
115        )
116    )
117
118    parser.add_argument(
119        "--config_file",
120        type=str,
121        action="store",
122        dest="config_file",
123        required=False,
124        default="default_config.yaml",
125        help=("Name of yaml configuration file"),
126    )
127
128    args = parser.parse_args()
129
130    try:
131        with open(args.config_file, encoding="utf-8") as handle:
132            config = yaml.load(handle, Loader=yaml.loader.SafeLoader)
133    except AttributeError as error:
134        print(
135            f"Invalid command. Try --help to get more detailserror message is {error}"
136        )
137
138    run_user_story(config)

Configuration and Utilities#

Default Configuration:

Default Configuration File#
---
# DIRECTIONS FOR USE
#
# Researchers should fill in this file with the relevant parameters for the
# model they are releasing. Parameters tagged with a 'generated by code' label
# are files that are generated by sacroml; you can change the filename, but it
# is not necessary. All other parameters need to be set by either the
# researcher or the TRE.

# Scenario to be run
user_story: UNDEFINED

# Details of experiments and files - replace these with the relevant filenames.

# Path to the dataset used to train the model.
# Researcher should supply this.
dataset_filename: "./user_stories_resources/dataset_26_nursery.csv"

# Name of the file (or directory for keras) containing the saved model.
# Researcher should supply this.
target_model: "model.pkl"

# Names of the report files, and the directory holding all the files needed
# to assess this release.
# Generated by code.
outfile: "summary.txt"
attack_results: "report.json"
training_artefacts_dir: "training_artefacts"

# User story 1, 2 or 3: name of the target results file generated by sacroml.
# Generated by code.
target_results: "target.yaml"

# User story 2
data_processing_filename: user_story_2/data_processing_researcher.py
data_processing_function_name: process_dataset

# User story 3
X_train_path: "X_train.txt"
y_train_path: "y_train.txt"
X_test_path: "X_test.txt"
y_test_path: "y_test.txt"

# User story 4 - replace with paths to csv files
train_probabilities: "output_train.csv"
test_probabilities: "output_test.csv"
...
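
The same settings can be produced programmatically. A minimal sketch that writes a User Story 1 configuration (key names taken from the file above; the output filename my_config.yaml is hypothetical) is:

# Write a minimal config for User Story 1 using the keys documented above.
import yaml

config = {
    "user_story": 1,
    "dataset_filename": "./user_stories_resources/dataset_26_nursery.csv",
    "target_model": "model.pkl",
    "outfile": "summary.txt",
    "attack_results": "report.json",
    "training_artefacts_dir": "training_artefacts",
    "target_results": "target.yaml",
}
with open("my_config.yaml", "w", encoding="utf-8") as handle:
    yaml.safe_dump(config, handle)

The report generation script below would then be run with --config_file my_config.yaml.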

Report Generation:

Disclosure Risk Report Generation#
 1"""TRE script to perform disclosure checking for a trained ML model.
 2
 3Researchers should fill out the relevant parameters in the .yaml file, which
 4should be in the same directory as this file. TREs can change the script that
 5is run using the user_story parameter at the top of the file.
 6
 7To run this code:
 8
 9python generate_disclosure_risk_report.py (with the .yaml file in the same directory)
10
11NOTE: you should not need to change this file at all.
12"""
13
14import argparse
15
16import yaml
17from user_story_1 import user_story_1_tre
18from user_story_2 import user_story_2_tre
19from user_story_3 import user_story_3_tre
20from user_story_4 import user_story_4_tre
21from user_story_7 import user_story_7_tre
22from user_story_8 import user_story_8_tre
23
24if __name__ == "__main__":
25    parser = argparse.ArgumentParser(
26        description=("Run user stories code from a config file")
27    )
28
29    parser.add_argument(
30        "--config_file",
31        type=str,
32        action="store",
33        dest="config_file",
34        required=False,
35        default="default_config.yaml",
36        help=("Name of yaml configuration file"),
37    )
38
39    args = parser.parse_args()
40
41    try:
42        with open(args.config_file, encoding="utf-8") as handle:
43            config = yaml.load(handle, Loader=yaml.loader.SafeLoader)
44    except AttributeError as error:  # pragma:no cover
45        print(
46            f"Invalid command. Try --help to get more detailserror message is {error}"
47        )
48
49    user_story = config["user_story"]
50    if user_story == "UNDEFINED":
51        print(
52            "User story not selected, please select a user story by "
53            "referring to user_stories_flow_chart.png and adding the "
54            "relevant number to the the first line of 'default_config.yaml'"
55        )
56    elif user_story == 1:
57        user_story_1_tre.run_user_story(config)
58    elif user_story == 2:
59        user_story_2_tre.run_user_story(config)
60    elif user_story == 3:
61        user_story_3_tre.run_user_story(config)
62    elif user_story == 4:
63        user_story_4_tre.run_user_story(config)
64    elif user_story == 7:
65        user_story_7_tre.run_user_story(config)
66    elif user_story == 8:
67        user_story_8_tre.run_user_story(config)
68    else:
69        raise NotImplementedError(f"User story {user_story} has not been implemented")