User Stories#

This section contains real-world user stories demonstrating how SACRO-ML is used in Trusted Research Environments (TREs).

User Story 1: Basic Model Training and Attack#

A basic workflow showing model training and privacy assessment.
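
At its core, the researcher's side of this story is four calls: train a safemodel classifier, run a preliminary disclosure check, wrap the model and data in a Target, then attack and release it. A condensed sketch using the same classes and calls as the full template below (with the data preparation elided, so X_train, y_train, X_test and y_test are assumed to exist) might look like:

# Condensed sketch of the User Story 1 researcher workflow.
# Assumes X_train, y_train, X_test, y_test have already been prepared.
from sacroml.attacks.target import Target
from sacroml.safemodel.classifiers import SafeDecisionTreeClassifier

model = SafeDecisionTreeClassifier(random_state=1)
model.fit(X_train, y_train)
model.preliminary_check()  # quick check that the hyperparameters are not disclosive

target = Target(
    model=model,
    dataset_name="nursery",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
model.run_attack(target, "worst_case", "training_artefacts")  # self-assessment
model.request_release(path="training_artefacts", ext="pkl", target=target)

The full template below adds the data reading, encoding, logging, and feature registration around these calls.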

Researcher Template:

User Story 1 - Researcher Template#
  1"""RESEARCHER EXAMPLE FOR USER STORY 1.
  2
  3This file is an example of a researcher creating/training a machine learning
  4model and requesting for it to be released.
  5
  6This specific example uses the nursery dataset: data is read in and
  7pre-processed, and a classifier is trained and tested on this dataset.
  8
  9This example follows User Story 1.
 10
 11Steps:
 12
 13- Researcher reads in data and processes it.
 14- Researcher creates and trains a classifier.
 15- Researcher runs experiments themselves to check if their model is disclosive
 16  or not.
 17- Once satisfied, researcher calls request_release() to make it ready for TRE
 18"""
 19
 20import logging
 21import os
 22
 23import numpy as np
 24import pandas as pd
 25from sklearn.model_selection import train_test_split
 26from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 27
 28from sacroml.attacks.target import Target
 29from sacroml.safemodel.classifiers import SafeDecisionTreeClassifier
 30
 31
 32def main():
 33    """Create and train a model to be released."""
 34    # This section is not necessary but helpful - cleans up files that are
 35    # created by sacroml
 36    save_directory = "training_artefacts"
 37    print("Creating directory for training artefacts")
 38
 39    if not os.path.exists(save_directory):
 40        os.makedirs(save_directory)
 41
 42    print()
 43    print("Acting as researcher...")
 44    print()
 45
 46    # Read in and pre-process the dataset - replace this with your data
 47    # reading/pre-processing code
 48    print(os.getcwd())
 49    filename = os.path.join(".", "user_stories_resources", "dataset_26_nursery.csv")
 50    print("Reading data from " + filename)
 51    data_df = pd.read_csv(filename)
 52
 53    print()
 54
 55    labels = np.asarray(data_df["class"])
 56    data = np.asarray(data_df.drop(columns=["class"], inplace=False))
 57
 58    n_features = np.shape(data)[1]
 59    indices: list[list[int]] = [
 60        [0, 1, 2],  # parents
 61        [3, 4, 5, 6, 7],  # has_nurs
 62        [8, 9, 10, 11],  # form
 63        [12, 13, 14, 15],  # children
 64        [16, 17, 18],  # housing
 65        [19, 20],  # finance
 66        [21, 22, 23],  # social
 67        [24, 25, 26],  # health
 68    ]
 69
 70    (
 71        X_train_orig,
 72        X_test_orig,
 73        y_train_orig,
 74        y_test_orig,
 75    ) = train_test_split(
 76        data,
 77        labels,
 78        test_size=0.5,
 79        stratify=labels,
 80        shuffle=True,
 81    )
 82
 83    label_enc = LabelEncoder()
 84    feature_enc = OneHotEncoder()
 85    X_train = feature_enc.fit_transform(X_train_orig).toarray()
 86    y_train = label_enc.fit_transform(y_train_orig)
 87    X_test = feature_enc.transform(X_test_orig).toarray()
 88    y_test = label_enc.transform(y_test_orig)
 89
 90    logging.getLogger("attack-reps").setLevel(logging.WARNING)
 91    logging.getLogger("prep-attack-data").setLevel(logging.WARNING)
 92    logging.getLogger("attack-from-preds").setLevel(logging.WARNING)
 93
 94    # Create and train a SafeDecisionTree classifier on the above data
 95    model = SafeDecisionTreeClassifier(random_state=1)
 96    model.fit(X_train, y_train)
 97
 98    # Run a preliminary check to make sure the model is not disclosive
 99    _, _ = model.preliminary_check()
100
101    # Wrap the model and data in a Target object
102    # needed in order to call request_release()
103    target = Target(
104        model=model,
105        dataset_name="nursery",
106        # processed data
107        X_train=X_train,
108        y_train=y_train,
109        X_test=X_test,
110        y_test=y_test,
111        # original unprocessed data
112        X_train_orig=X_train_orig,
113        y_train_orig=y_train_orig,
114        X_test_orig=X_test_orig,
115        y_test_orig=y_test_orig,
116    )
117    for i in range(n_features):
118        target.add_feature(data_df.columns[i], indices[i], "onehot")
119
120    logging.info("Dataset: %s", target.dataset_name)
121    logging.info("Features: %s", target.features)
122    logging.info("X_train shape: %s", str(target.X_train.shape))
123    logging.info("y_train shape: %s", str(target.y_train.shape))
124    logging.info("X_test shape: %s", str(target.X_test.shape))
125    logging.info("y_test shape: %s", str(target.y_test.shape))
126
127    # Researcher can check for themselves whether their model passes individual
128    # disclosure checks.Leave this code as-is for output disclosure checking.
129    print("==========> first running attacks explicitly via run_attack()")
130    for attack_name in ["worst_case", "attribute", "lira"]:
131        print(f"===> running {attack_name} attack directly")
132        metadata = model.run_attack(target, attack_name, save_directory)
133        logging.info("metadata is:")
134        for key, val in metadata.items():
135            if isinstance(val, dict):
136                logging.info(" %s ", key)
137                for key1, val2 in val.items():
138                    logging.info("  %s : %s", key1, val2)
139            else:
140                logging.info(" %s : %s", key, val)
141
142    # Modify/re-run all of the above code until you're happy with the model
143    # you've created. If the tests do not pass, try changing the model or
144    # hyperparameters until the tests pass. When you are satisfied and ready to
145    # release your model, call the request release() function with the Target
146    # class you created above.
147
148    # This code will run checks for the TRE staff.
149
150    # NOTE: you should only do this when you have confirmed that the above
151    # tests pass. You would not normally waste your and TRE time calling this
152    # unless you have already checked that your model is not disclosive or can
153    # provide a justification for an exception request.
154
155    print("===> now running attacks implicitly via request_release()")
156    model.request_release(path=save_directory, ext="pkl", target=target)
157
158    # The files generated can be found in this file location.
159    print(f"Please see the files generated in: {save_directory}")
160
161
162if __name__ == "__main__":
163    main()

TRE Implementation:

User Story 1 - TRE Implementation#
 1"""TRE SCRIPT FOR USER STORY 1.
 2
 3This file contains the code needed to run user story 1.
 4
 5To run: change the user_story key inside the .yaml config file to '1', and run
 6the 'generate_disclosure_risk_report.py' file.
 7
 8NOTE: you should not need to change this file at all, set all parameters via
 9the .yaml file.
10"""
11
12import argparse
13import os
14
15import yaml
16
17from sacroml.attacks.attack_report_formatter import GenerateTextReport
18
19
20def generate_report(directory, attack_results, target, outfile):
21    """Generate report based on target model."""
22    print()
23    print("Acting as TRE...")
24    print()
25
26    text_report = GenerateTextReport()
27
28    attack_pathname = os.path.join(directory, attack_results)
29    text_report.process_attack_target_json(
30        attack_pathname, target_filename=os.path.join(directory, target)
31    )
32
33    out_pathname = os.path.join(directory, outfile)
34    text_report.export_to_file(output_filename=out_pathname, move_files=True)
35
36    print("Results written to " + out_pathname)
37
38
39def run_user_story(release_config: dict):
40    """Run the user story, parsing arguments and then invoking report generation."""
41    generate_report(
42        release_config["training_artefacts_dir"],
43        release_config["attack_results"],
44        release_config["target_results"],
45        release_config["outfile"],
46    )
47
48
49if __name__ == "__main__":
50    parser = argparse.ArgumentParser(
51        description=(
52            "Generate a risk report after request_release() "
53            "has been called by researcher"
54        )
55    )
56
57    parser.add_argument(
58        "--config_file",
59        type=str,
60        action="store",
61        dest="config_file",
62        required=False,
63        default="default_config.yaml",
64        help=("Name of yaml configuration file"),
65    )
66
67    args = parser.parse_args()
68
69    try:
70        with open(args.config_file, encoding="utf-8") as handle:
71            config = yaml.load(handle, Loader=yaml.loader.SafeLoader)
72    except AttributeError as error:
73        print(
74            f"Invalid command. Try --help to get more detailserror message is {error}"
75        )
76
77    run_user_story(config)
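
The script is normally driven by the YAML config file (see Configuration and Utilities below), but run_user_story() only needs a dict with four keys, so it can also be exercised directly. For example, with values mirroring the defaults in default_config.yaml:

# Direct invocation with the same keys the YAML config supplies.
run_user_story(
    {
        "training_artefacts_dir": "training_artefacts",
        "attack_results": "report.json",
        "target_results": "target.yaml",
        "outfile": "summary.txt",
    }
)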

User Story 2: Data Processing and Privacy Assessment#

An advanced workflow in which the researcher supplies a reusable data pre-processing function, so the TRE can rebuild the train/test split and run a comprehensive privacy assessment itself.

Data Processing:

User Story 2 - Data Processing#
 1"""SUPPORTING FILE FOR USER STORY 2.
 2
 3This file is an example of a function created by a researcher that will
 4pre-process a dataset.
 5
 6To use: write a function that will process your input data, and output the
 7processed version.
 8
 9NOTE: in order to work, this function needs to:
10
11- take a single parameter (the data to be processed)
12- return a dictionary
13- which contains the keys ]
14    ['n_features_raw_data', 'X_transformed', 'y_transformed', 'train_indices']
15"""
16
17import numpy as np
18from sklearn.model_selection import train_test_split
19from sklearn.preprocessing import LabelEncoder, OneHotEncoder
20
21
22def process_dataset(data):
23    """Create a function that does the data pre-processing for user story 2."""
24    # Replace the contents of this function with your pre-processing code
25
26    labels = np.asarray(data["class"])
27    data = np.asarray(data.drop(columns=["class"], inplace=False))
28
29    n_features_raw_data = np.shape(data)[1]
30
31    label_enc = LabelEncoder()
32    feature_enc = OneHotEncoder()
33    X_transformed = feature_enc.fit_transform(data).toarray()
34    y_transformed = label_enc.fit_transform(labels)
35
36    row_indices = np.arange(np.shape(X_transformed)[0])
37
38    # This step is not necessary, however it's the simplest way of getting
39    # training indices from the data. Any method of generating indices of
40    # samples to be used for training will work here.
41    (
42        X_train,
43        X_test,
44        y_train,
45        y_test,
46        train_indices,
47        test_indices,
48    ) = train_test_split(
49        X_transformed,
50        y_transformed,
51        row_indices,
52        test_size=0.5,
53        stratify=y_transformed,
54        shuffle=True,
55    )
56
57    returned = {}
58    returned["n_features_raw_data"] = n_features_raw_data
59    returned["X_transformed"] = X_transformed
60    returned["y_transformed"] = y_transformed
61    returned["train_indices"] = train_indices
62
63    return returned
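
Since the TRE-side wrapper simply loads and calls this function, it is worth checking the contract before hand-off. A minimal self-test (hypothetical, using the same dataset path as the other templates) might be:

# Quick self-check that process_dataset() honours the required contract.
import pandas as pd

data = pd.read_csv("./user_stories_resources/dataset_26_nursery.csv")
returned = process_dataset(data)
required = {"n_features_raw_data", "X_transformed", "y_transformed", "train_indices"}
missing = required - set(returned)
assert not missing, f"process_dataset() is missing keys: {missing}"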

Researcher Template:

User Story 2 - Researcher Template#
  1"""RESEARCHER EXAMPLE FOR USER STORY 2.
  2
  3This file is an example of a researcher creating/training a machine learning
  4model and to be released form a secure environment.
  5
  6This specific example uses the nursery dataset: data is read in and
  7pre-processed, and a classifier is trained and tested on this dataset.
  8
  9This example follows User Story 2.
 10
 11Steps:
 12
 13- Researcher creates a function to read and process a dataset, which a TRE can
 14  also use and call.
 15- Researcher creates and trains a classifier on this data.
 16- Researcher emails (or otherwise contacts) TRE to request the model be released.
 17- TREs will use this code/functions to test the model themselves.
 18"""
 19
 20import logging
 21import os
 22
 23import numpy as np
 24import pandas as pd
 25from data_processing_researcher import process_dataset
 26
 27from sacroml.attacks.target import Target
 28from sacroml.safemodel.classifiers import SafeDecisionTreeClassifier
 29
 30
 31def run_user_story():
 32    """Create and train a model to be released."""
 33    # This section is not necessary but helpful - cleans up files that are
 34    # created by sacroml
 35    directory = "training_artefacts"
 36    print("Creating directory for training artefacts")
 37
 38    if not os.path.exists(directory):
 39        os.makedirs(directory)
 40
 41    print()
 42    print("Acting as researcher...")
 43    print()
 44
 45    # Read in and pre-process the dataset - replace this with your dataset
 46    filename = os.path.join(".", "user_stories_resources", "dataset_26_nursery.csv")
 47    print("Reading data from " + filename)
 48    data = pd.read_csv(filename)
 49
 50    # Write a function to pre-process the data that the TRE can call (see
 51    # data_processing_researcher.py) Use the output of this function to split
 52    # the data into training/testing sets.
 53
 54    # NOTE: to use this user story/script, the process_dataset function MUST:
 55    # take a single parameter (the data to be processed) return a dictionary
 56    # which contains the keys:
 57    # >>> ['n_features_raw_data', 'X_transformed', 'y_transformed', 'train_indices']
 58    # as in this example.
 59
 60    returned = process_dataset(data)
 61
 62    X_transformed = returned["X_transformed"]
 63    y_transformed = returned["y_transformed"]
 64
 65    train_indices = set(returned["train_indices"])
 66
 67    X_train = []
 68    X_test = []
 69    y_train = []
 70    y_test = []
 71
 72    for i, label in enumerate(y_transformed):
 73        if i in train_indices:
 74            X_train.append(X_transformed[i])
 75            y_train.append(label)
 76        else:
 77            X_test.append(X_transformed[i])
 78            y_test.append(label)
 79
 80    logging.getLogger("attack-reps").setLevel(logging.WARNING)
 81    logging.getLogger("prep-attack-data").setLevel(logging.WARNING)
 82    logging.getLogger("attack-from-preds").setLevel(logging.WARNING)
 83
 84    # Build a model and request its release
 85    model = SafeDecisionTreeClassifier(random_state=1)
 86    model.fit(X_train, y_train)
 87    model.request_release(path=directory, ext="pkl")
 88
 89    # Wrap the model and data in a Target object
 90    target = Target(
 91        model=model,
 92        dataset_name="nursery",
 93        X_train=X_train,
 94        y_train=y_train,
 95        X_test=X_test,
 96        y_test=y_test,
 97    )
 98
 99    # NOTE: we assume here that the researcher does not use the target.save()
100    # function and instead provides only the model and the list of indices
101    # which have been used to split the dataset, which will allow a TRE to
102    # re-create the input data used in training.
103
104    logging.info("Dataset: %s", target.name)
105    logging.info("Features: %s", target.features)
106    logging.info("X_train shape = %s", np.shape(target.X_train))
107    logging.info("y_train shape = %s", np.shape(target.y_train))
108    logging.info("X_test shape = %s", np.shape(target.X_test))
109    logging.info("y_test shape = %s", np.shape(target.y_test))
110
111
112if __name__ == "__main__":
113    run_user_story()
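
The NOTE in the template says the researcher hands over the model plus the list of training indices, but it does not show how the indices are saved. One minimal possibility (hypothetical, not part of the released example, and reusing the template's directory and train_indices variables) is to write them alongside the other artefacts:

# Hypothetical: persist the training indices so the TRE can rebuild the split.
np.savetxt(
    os.path.join(directory, "train_indices.txt"),
    np.asarray(sorted(train_indices)),
    fmt="%d",
)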

TRE Implementation:

User Story 2 - TRE Implementation#
  1"""TRE SCRIPT FOR USER STORY 2.
  2
  3This file contains the code needed to run user story 2.
  4
  5To run: change the user_story key inside the .yaml config file to '2', and run
  6the 'generate_disclosure_risk_report.py' file.
  7
  8NOTE: you should not need to change this file at all, set all parameters via
  9the .yaml file.
 10"""
 11
 12import argparse
 13import importlib
 14import os
 15import pickle
 16
 17import numpy as np
 18import pandas as pd
 19import yaml
 20
 21from sacroml.attacks.attack_report_formatter import GenerateTextReport
 22from sacroml.attacks.target import Target
 23
 24
 25def process_dataset(filename, function_name, data_to_be_processed):
 26    """Process dataset.
 27
 28    DO NOT CHANGE: this is a wrapper function that allows a callable function
 29    to be read from a file.
 30    """
 31    spec = importlib.util.spec_from_file_location(function_name, filename)
 32    module = importlib.util.module_from_spec(spec)
 33    spec.loader.exec_module(module)
 34    function = getattr(module, function_name)
 35    return function(data_to_be_processed)
 36
 37
 38def generate_report(
 39    data_processing_filename,
 40    data_processing_function_name,
 41    dataset_filename,
 42    directory,
 43    target_model,
 44    attack_results,
 45    target_filename,
 46    outfile,
 47):
 48    """Generate report based on target model."""
 49    print()
 50    print("Acting as TRE...")
 51    print(
 52        "(when instructions on how to recreate the dataset have "
 53        "been provided by the researcher)"
 54    )
 55    print(directory)
 56    print()
 57
 58    # Read in the model supplied by the researcher
 59    filename = os.path.join(directory, target_model)
 60    print("Reading target model from " + filename)
 61    with open(filename, "rb") as f:
 62        target_model = pickle.load(f)
 63
 64    # Read the data used by the researcher, and process it using their defined function
 65    print("Reading data from " + dataset_filename)
 66    data = pd.read_csv(dataset_filename)
 67
 68    returned = process_dataset(
 69        data_processing_filename, data_processing_function_name, data
 70    )
 71    X_transformed = returned["X_transformed"]
 72    y_transformed = returned["y_transformed"]
 73    train_indices = set(returned["train_indices"])
 74
 75    X_train = []
 76    X_test = []
 77    y_train = []
 78    y_test = []
 79
 80    for i, label in enumerate(y_transformed):
 81        if i in train_indices:
 82            X_train.append(X_transformed[i])
 83            y_train.append(label)
 84        else:
 85            X_test.append(X_transformed[i])
 86            y_test.append(label)
 87
 88    X_train = np.array(X_train)
 89    y_train = np.array(y_train)
 90    X_test = np.array(X_test)
 91    y_test = np.array(y_test)
 92
 93    # Wrap the model and data in a Target object
 94    target = Target(
 95        model=target_model,
 96        X_train=X_train,
 97        y_train=y_train,
 98        X_test=X_test,
 99        y_test=y_test,
100    )
101
102    # TRE calls request_release()
103    print("===> now running attacks implicitly via request_release()")
104    target_model.request_release(path=directory, ext="pkl", target=target)
105
106    print(f"Please see the files generated in: {directory}")
107
108    # Generate a report indicating calculated disclosure risk
109    text_report = GenerateTextReport()
110    text_report.process_attack_target_json(
111        os.path.join(directory, attack_results),
112        target_filename=os.path.join(directory, target_filename),
113    )
114
115    text_report.export_to_file(
116        output_filename=os.path.join(directory, outfile), move_files=True
117    )
118
119    print("Results written to " + str(os.path.join(directory, outfile)))
120
121
122def run_user_story(release_config: dict):
123    """Run the user story, parsing arguments and then invoking report generation."""
124    generate_report(
125        release_config["data_processing_filename"],
126        release_config["data_processing_function_name"],
127        release_config["dataset_filename"],
128        release_config["training_artefacts_dir"],
129        release_config["target_model"],
130        release_config["attack_results"],
131        release_config["target_results"],
132        release_config["outfile"],
133    )
134
135
136if __name__ == "__main__":
137    parser = argparse.ArgumentParser(
138        description=(
139            "Generate a risk report after request_release() "
140            "has been called by researcher"
141        )
142    )
143
144    parser.add_argument(
145        "--config_file",
146        type=str,
147        action="store",
148        dest="config_file",
149        required=False,
150        default="default_config.yaml",
151        help=("Name of yaml configuration file"),
152    )
153
154    args = parser.parse_args()
155
156    try:
157        with open(args.config_file, encoding="utf-8") as handle:
158            config = yaml.load(handle, Loader=yaml.loader.SafeLoader)
159    except AttributeError as error:
160        print(
161            f"Invalid command. Try --help to get more detailserror message is {error}"
162        )
163
164    run_user_story(config)

User Story 3: Advanced Privacy Analysis#

A workflow for comprehensive privacy analysis of sensitive datasets: the researcher saves the model and data manually, and the TRE runs the attacks (worst-case and LiRA) directly.

Researcher Template:

User Story 3 - Researcher Template#
 1"""RESEARCHER EXAMPLE FOR USER STORY 3.
 2
 3This file is an example of a researcher creating/training a machine learning
 4model and to be released form a secure environment.
 5
 6This specific example uses the nursery dataset: data is read in and
 7pre-processed, and a classifier is trained and tested on this dataset.
 8
 9This example follows User Story 3.
10
11Steps:
12
13- Researcher creates and pre-processes a dataset.
14- Researcher creates and trains a classifier on this data.
15- Reasercher saves the model manually (e.g. using pickle, not through
16  request_release() or similar).
17- Researcher emails (or otherwise contacts) TRE to request the model be released.
18- TREs will use this model and data to test the model themselves.
19"""
20
21import os
22import pickle
23
24import numpy as np
25import pandas as pd
26from sklearn.ensemble import RandomForestClassifier
27from sklearn.metrics import accuracy_score
28from sklearn.model_selection import train_test_split
29from sklearn.preprocessing import LabelEncoder, OneHotEncoder
30
31
32def run_user_story():
33    """Create and train a model to be released."""
34    # This section is not necessary but helpful - cleans up files that are
35    # created by sacroml
36    directory = "training_artefacts"
37    print("Creating directory for training artefacts")
38
39    if not os.path.exists(directory):
40        os.makedirs(directory)
41
42    # Read in and pre-process the dataset - replace this with your data
43    # reading/pre-processing code.
44    filename = os.path.join(".", "user_stories_resources", "dataset_26_nursery.csv")
45    print("Reading data from " + filename)
46    data = pd.read_csv(filename)
47
48    target_encoder = LabelEncoder()
49    target_vals = target_encoder.fit_transform(data["class"].values)
50    target_dataframe = pd.DataFrame({"class": target_vals})
51    data = data.drop(columns=["class"], inplace=False)
52
53    feature_encoder = OneHotEncoder()
54    X_encoded = feature_encoder.fit_transform(data).toarray()
55    feature_dataframe = pd.DataFrame(
56        X_encoded, columns=feature_encoder.get_feature_names_out()
57    )
58
59    X_train, X_test, y_train, y_test = train_test_split(
60        feature_dataframe.values,
61        target_dataframe.to_numpy().flatten(),
62        test_size=0.7,
63        random_state=42,
64    )
65
66    # Save the training and test data to a file which a TRE can access
67    print("Saving training/testing data to ./" + directory)
68    np.savetxt(os.path.join(directory, "X_train.txt"), X_train, fmt="%d")
69    np.savetxt(os.path.join(directory, "y_train.txt"), y_train, fmt="%d")
70    np.savetxt(os.path.join(directory, "X_test.txt"), X_test, fmt="%d")
71    np.savetxt(os.path.join(directory, "y_test.txt"), y_test, fmt="%d")
72
73    # Create, train and test a model
74    # Replace this with your training and testing code
75    hyperparameters = {}
76    hyperparameters["min_samples_split"] = 5
77    hyperparameters["min_samples_leaf"] = 5
78    hyperparameters["max_depth"] = None
79    hyperparameters["bootstrap"] = False
80
81    target_model = RandomForestClassifier(**hyperparameters)
82    target_model.fit(X_train, y_train)
83
84    train_acc = accuracy_score(y_train, target_model.predict(X_train))
85    test_acc = accuracy_score(y_test, target_model.predict(X_test))
86    print(f"Training accuracy on model: {train_acc:.2f}")
87    print(f"Testing accuracy on model: {test_acc:.2f}")
88
89    # Save your model somewhere a TRE can access
90    filename = os.path.join(directory, "model.pkl")
91    print("Saving model to " + filename)
92    with open(filename, "wb") as file:
93        pickle.dump(target_model, file)
94
95
96if __name__ == "__main__":
97    run_user_story()

TRE Implementation:

User Story 3 - TRE Implementation#
  1"""TRE SCRIPT FOR USER STORY 3.
  2
  3This file contains the code needed to run user story 3.
  4
  5To run: change the user_story key inside the .yaml config file to '3', and run
  6the 'generate_disclosure_risk_report.py' file.
  7
  8NOTE: you should not need to change this file at all, set all parameters via
  9the .yaml file.
 10"""
 11
 12import argparse
 13import logging
 14import os
 15import pickle
 16
 17import numpy as np
 18import yaml
 19
 20from sacroml.attacks.attack_report_formatter import GenerateTextReport
 21from sacroml.attacks.likelihood_attack import LIRAAttack
 22from sacroml.attacks.target import Target
 23from sacroml.attacks.worst_case_attack import WorstCaseAttack
 24
 25
 26def generate_report(
 27    directory,
 28    target_model,
 29    X_train,
 30    y_train,
 31    X_test,
 32    y_test,
 33    target_filename,
 34    outfile,
 35):
 36    """Generate report based on target model."""
 37    print()
 38    print("Acting as TRE...")
 39    print()
 40
 41    if not os.path.exists(directory):
 42        os.makedirs(directory)
 43
 44    # Suppress messages from SACRO-ML -- comment out these lines to
 45    # see all the sacroml logging statements
 46    logging.getLogger("attack-reps").setLevel(logging.WARNING)
 47    logging.getLogger("prep-attack-data").setLevel(logging.WARNING)
 48    logging.getLogger("attack-from-preds").setLevel(logging.WARNING)
 49
 50    # Read the model to be released as supplied by the researcher
 51    model_filename = os.path.join(directory, target_model)
 52    print("Reading target model from " + model_filename)
 53    with open(model_filename, "rb") as file:
 54        target_model = pickle.load(file)
 55
 56    # Read the training/testing data as supplied by the researcher
 57    print("Reading training/testing data from ./" + directory)
 58    train_x = np.loadtxt(os.path.join(directory, X_train))
 59    train_y = np.loadtxt(os.path.join(directory, y_train))
 60    test_x = np.loadtxt(os.path.join(directory, X_test))
 61    test_y = np.loadtxt(os.path.join(directory, y_test))
 62
 63    # Wrap the training and test data into the Target object
 64    target = Target(
 65        model=target_model,
 66        X_train=train_x,
 67        y_train=train_y,
 68        X_test=test_x,
 69        y_test=test_y,
 70    )
 71    target.save(os.path.join(directory, "target"))
 72
 73    # Run the attack
 74    wca = WorstCaseAttack(n_dummy_reps=10, output_dir=directory)
 75    wca.attack(target)
 76
 77    # Run the LiRA attack to test disclosure risk
 78    lira_attack_obj = LIRAAttack(n_shadow_models=100, output_dir=directory)
 79    lira_attack_obj.attack(target)
 80
 81    text_report = GenerateTextReport()
 82    text_report.process_attack_target_json(
 83        os.path.join(directory, "report") + ".json",
 84        target_filename=os.path.join(directory, "target", target_filename),
 85    )
 86
 87    text_report.export_to_file(
 88        output_filename=os.path.join(directory, outfile),
 89        move_files=True,
 90        model_filename=model_filename,
 91    )
 92
 93    print("Results written to " + os.path.join(directory, outfile))
 94
 95
 96def run_user_story(release_config: dict):
 97    """Run the user story, parsing arguments and then invoking report generation."""
 98    generate_report(
 99        release_config["training_artefacts_dir"],
100        release_config["target_model"],
101        release_config["X_train_path"],
102        release_config["y_train_path"],
103        release_config["X_test_path"],
104        release_config["y_test_path"],
105        release_config["target_results"],
106        release_config["outfile"],
107    )
108
109
110if __name__ == "__main__":
111    parser = argparse.ArgumentParser(
112        description=(
113            "Generate a risk report after request_release() "
114            "has been called by researcher"
115        )
116    )
117
118    parser.add_argument(
119        "--config_file",
120        type=str,
121        action="store",
122        dest="config_file",
123        required=False,
124        default="default_config.yaml",
125        help=("Name of yaml configuration file"),
126    )
127
128    args = parser.parse_args()
129
130    try:
131        with open(args.config_file, encoding="utf-8") as handle:
132            config = yaml.load(handle, Loader=yaml.loader.SafeLoader)
133    except AttributeError as error:
134        print(
135            f"Invalid command. Try --help to get more detailserror message is {error}"
136        )
137
138    run_user_story(config)

Configuration and Utilities#

Default Configuration:

Default Configuration File#
---
# DIRECTIONS FOR USE
#
# Researchers should fill in this file with the relevant parameters for the
# model they are releasing. Parameters tagged with a 'generated by code' label
# are files that are generated by sacroml; you can change the filename, but it
# is not necessary. All other parameters need to be set by either the
# researcher or the TRE.

# Scenario to be run
user_story: UNDEFINED

# Details of experiments and files - replace these with the relevant filenames.

# Path to the dataset used to train the model.
# Researcher should supply this.
dataset_filename: "./user_stories_resources/dataset_26_nursery.csv"

# Name of the file (or directory for keras) containing the saved model.
# Researcher should supply this.
target_model: "model.pkl"

# Names of the report files, and the directory holding all the files needed
# to assess this release.
# Generated by code.
outfile: "summary.txt"
attack_results: "report.json"
training_artefacts_dir: "training_artefacts"

# User story 1, 2 or 3: name of the target results file generated by sacroml.
# Generated by code.
target_results: "target.yaml"

# User story 2
data_processing_filename: user_story_2/data_processing_researcher.py
data_processing_function_name: process_dataset

# User story 3
X_train_path: "X_train.txt"
y_train_path: "y_train.txt"
X_test_path: "X_test.txt"
y_test_path: "y_test.txt"

# User story 4 - replace with paths to csv files
train_probabilities: "output_train.csv"
test_probabilities: "output_test.csv"
...
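
The same settings can be produced programmatically. A minimal sketch that writes a User Story 1 configuration (key names taken from the file above; the output filename my_config.yaml is hypothetical) is:

# Write a minimal config for User Story 1 using the keys documented above.
import yaml

config = {
    "user_story": 1,
    "dataset_filename": "./user_stories_resources/dataset_26_nursery.csv",
    "target_model": "model.pkl",
    "outfile": "summary.txt",
    "attack_results": "report.json",
    "training_artefacts_dir": "training_artefacts",
    "target_results": "target.yaml",
}
with open("my_config.yaml", "w", encoding="utf-8") as handle:
    yaml.safe_dump(config, handle)

The report generation script below would then be run with --config_file my_config.yaml.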

Report Generation:

Disclosure Risk Report Generation#
 1"""TRE script to perform disclosure checking for a trained ML model.
 2
 3Researchers should fill out the relevant parameters in the .yaml file, which
 4should be in the same directory as this file. TREs can change the script that
 5is run using the user_story parameter at the top of the file.
 6
 7To run this code:
 8
 9python generate_disclosure_risk_report.py (with the .yaml file in the same directory)
10
11NOTE: you should not need to change this file at all.
12"""
13
14import argparse
15
16import yaml
17from user_story_1 import user_story_1_tre
18from user_story_2 import user_story_2_tre
19from user_story_3 import user_story_3_tre
20from user_story_4 import user_story_4_tre
21from user_story_7 import user_story_7_tre
22from user_story_8 import user_story_8_tre
23
24if __name__ == "__main__":
25    parser = argparse.ArgumentParser(
26        description=("Run user stories code from a config file")
27    )
28
29    parser.add_argument(
30        "--config_file",
31        type=str,
32        action="store",
33        dest="config_file",
34        required=False,
35        default="default_config.yaml",
36        help=("Name of yaml configuration file"),
37    )
38
39    args = parser.parse_args()
40
41    try:
42        with open(args.config_file, encoding="utf-8") as handle:
43            config = yaml.load(handle, Loader=yaml.loader.SafeLoader)
44    except AttributeError as error:  # pragma:no cover
45        print(
46            f"Invalid command. Try --help to get more detailserror message is {error}"
47        )
48
49    user_story = config["user_story"]
50    if user_story == "UNDEFINED":
51        print(
52            "User story not selected, please select a user story by "
53            "referring to user_stories_flow_chart.png and adding the "
54            "relevant number to the the first line of 'default_config.yaml'"
55        )
56    elif user_story == 1:
57        user_story_1_tre.run_user_story(config)
58    elif user_story == 2:
59        user_story_2_tre.run_user_story(config)
60    elif user_story == 3:
61        user_story_3_tre.run_user_story(config)
62    elif user_story == 4:
63        user_story_4_tre.run_user_story(config)
64    elif user_story == 7:
65        user_story_7_tre.run_user_story(config)
66    elif user_story == 8:
67        user_story_8_tre.run_user_story(config)
68    else:
69        raise NotImplementedError(f"User story {user_story} has not been implemented")