# attack_with_jsma_rate_table_mnist.py

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import numpy as np
from six.moves import xrange
import tensorflow as tf
from tensorflow.python.platform import flags
import logging
import os

from cleverhans.attacks import SaliencyMapMethod
from cleverhans.utils import other_classes, set_log_level
from cleverhans.utils import pair_visual, grid_visual, AccuracyReport, create_logger
from cleverhans.utils_mnist import data_mnist
from cleverhans.utils_tf import model_train, model_eval, model_argmax, tf_model_load
from cleverhans.utils_keras import KerasModelWrapper, cnn_model
from cleverhans_tutorials.tutorial_models import make_basic_cnn

# Logger created through cleverhans' create_logger under the name
# "cleverhans.utils.tf"; no active code in the visible part of this file
# uses it.
_logger = create_logger("cleverhans.utils.tf")
# Shorthand for the flag container read by main(); the individual flags are
# registered in the __main__ block at the bottom of the file.
FLAGS = flags.FLAGS


def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, nb_classes=10, source_samples=10,
                        learning_rate=0.001):
    """
    Attack a previously trained MNIST model with the Jacobian-based saliency
    map approach (JSMA) and print per-class success statistics.

    The test set is read from ``mnist.npz`` (keys ``X_test`` / ``Y_test``) and
    the model weights are loaded with ``tf_model_load`` from
    ``./clean_trained_mnist_model``. For each of the first ``source_samples``
    examples of the shuffled test set, one adversarial example is crafted per
    class other than the true one. The function prints:

    * a ``nb_classes x nb_classes`` table whose entry ``[source, target]`` is
      the fraction of inputs of class ``source`` that were successfully
      pushed into class ``target``;
    * the number of attacked inputs per class;
    * the average targeted-success and misclassification rates;
    * the average fraction of perturbed features, over all attempts and over
      successful attempts only.

    :param train_start: unused (training is not performed by this function)
    :param train_end: unused (training is not performed by this function)
    :param test_start: index of first test set example; only used in the
                       sanity check on the size of the loaded test set
    :param test_end: index of last test set example; only used in the
                     sanity check on the size of the loaded test set
    :param viz_enabled: unused (visualisation is not performed)
    :param nb_epochs: unused (training is not performed by this function)
    :param batch_size: batch size used when evaluating the clean accuracy
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: unused (training is not performed by this function)
    :return: an AccuracyReport object (returned without any field populated)
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(4254264)

    set_log_level(logging.DEBUG)

    # Only the test split is needed: the model is loaded, not trained here.
    with np.load("mnist.npz") as data:
        X_test, Y_test = data['X_test'], data['Y_test']

    # Define input TF placeholders from the declared dimensions so that the
    # graph follows nb_classes instead of a hard-coded 10.
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    # Everything that needs the session lives inside try/finally so that the
    # session is released even when evaluation or the attack raises.
    try:
        # Define TF model graph and load the stored weights
        model_path = "./"
        model_name = "clean_trained_mnist_model"
        model = make_basic_cnn(nb_classes=nb_classes)
        if tf_model_load(sess, file_path=os.path.join(model_path, model_name)):
            print(model_name, " reloaded.")
        preds = model.get_probs(x)

        # Evaluate the accuracy of the model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                              args=eval_params)
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

        #######################################################################
        # Craft adversarial examples using the Jacobian-based saliency map
        # approach
        #######################################################################
        print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
              ' adversarial examples')

        # Targeted success (adversarial example classified as the target)
        results = np.zeros((nb_classes, source_samples), dtype='i')
        # Plain misclassification (prediction differs from the true class)
        results2 = np.zeros((nb_classes, source_samples), dtype='i')

        # Rate of perturbed features for each test set example and target
        perturbations = np.zeros((nb_classes, source_samples), dtype='f')

        # Instantiate a SaliencyMapMethod attack object
        jsma = SaliencyMapMethod(model, back='tf', sess=sess)
        jsma_params = {'theta': 1, 'gamma': 0.1,
                       'clip_min': 0., 'clip_max': 1.,
                       'y_target': None}

        # Shuffle the test set with a fixed seed so that the attacked inputs
        # are a reproducible random subset.
        rng = np.random.RandomState([1358, 23, 234])
        index_shuf = list(range(len(X_test)))
        rng.shuffle(index_shuf)
        X_test = X_test[index_shuf]
        Y_test = Y_test[index_shuf]

        # Number of attacked inputs per true class, sized from nb_classes.
        occurrence = {label: 0 for label in range(nb_classes)}
        # rate_table[source, target]: count (later rate) of successful attacks
        rate_table = np.zeros((nb_classes, nb_classes), dtype='f')

        # Loop over the samples we want to perturb into adversarial examples
        for sample_ind in xrange(0, source_samples):
            print('--------------------------------------')
            print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
            sample = X_test[sample_ind:(sample_ind+1)]
            original_flat = sample.reshape(-1)

            # We want an adversarial example for each possible target class
            # (i.e. all classes that differ from the label in the dataset)
            current_class = int(np.argmax(Y_test[sample_ind]))
            target_classes = other_classes(nb_classes, current_class)

            occurrence[current_class] += 1

            # Loop over all target classes
            for target in target_classes:
                print('Generating adv. example for target class %i' % target)

                # This call runs the Jacobian-based saliency map approach
                one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
                one_hot_target[0, target] = 1
                jsma_params['y_target'] = one_hot_target
                adv_x = jsma.generate_np(sample, **jsma_params)

                # Classify the adversarial example once and derive both
                # outcome flags from that single prediction.
                predicted = model_argmax(sess, x, preds, adv_x)
                res = int(predicted == target)
                res2 = int(predicted != current_class)
                if res == 1:
                    rate_table[current_class, target] += 1.

                # Compute the fraction of modified features
                adv_flat = adv_x.reshape(-1)
                nb_changed = np.count_nonzero(adv_flat != original_flat)
                percent_perturb = float(nb_changed) / adv_flat.shape[0]

                # Update the arrays for later analysis
                results[target, sample_ind] = res
                results2[target, sample_ind] = res2
                perturbations[target, sample_ind] = percent_perturb

        print('--------------------------------------')
    finally:
        # Close TF session
        sess.close()

    # Turn success counts into rates per source class; classes that were
    # never attacked keep a row of zeros.
    for cur in range(nb_classes):
        if occurrence[cur] != 0:
            rate_table[cur, :] /= float(occurrence[cur])
    print("The table of rate of successful attacking is shown below")
    print(rate_table)
    print("the number of occurrence of each class is ", occurrence)

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    succ_rate2 = float(np.sum(results2)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    print('Avg. rate of misclassified adv. examples {0:.4f}'.format(succ_rate2))

    # Average distortion over the attacks that were actually attempted. The
    # (true class, sample) cells are never attacked and stay at zero, so a
    # plain mean over the whole array would understate the value.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Average distortion over successful attacks only: divide by the number
    # of successes rather than by the size of the whole array.
    succeeded = results == 1
    nb_succeeded = int(np.count_nonzero(succeeded))
    if nb_succeeded:
        percent_perturb_succ = (float(np.sum(perturbations[succeeded])) /
                                nb_succeeded)
    else:
        percent_perturb_succ = 0.0
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    return report


def main(argv=None):
    """
    Script entry point: run the JSMA experiment with the values of the
    command-line flags.

    :param argv: accepted but not used
    """
    # Collect the flag values first, then forward them as keyword arguments.
    options = dict(
        viz_enabled=FLAGS.viz_enabled,
        nb_epochs=FLAGS.nb_epochs,
        batch_size=FLAGS.batch_size,
        nb_classes=FLAGS.nb_classes,
        source_samples=FLAGS.source_samples,
        learning_rate=FLAGS.learning_rate,
    )
    mnist_tutorial_jsma(**options)


if __name__ == '__main__':
    # Register the command-line flags from a table of
    # (definer, name, default, help text), in the original order.
    _flag_table = (
        (flags.DEFINE_boolean, 'viz_enabled', False,
         'Visualize adversarial ex.'),
        (flags.DEFINE_integer, 'nb_epochs', 8,
         'Number of epochs to train model'),
        (flags.DEFINE_integer, 'batch_size', 128,
         'Size of training batches'),
        (flags.DEFINE_integer, 'nb_classes', 10,
         'Number of output classes'),
        (flags.DEFINE_integer, 'source_samples', 1000,
         'Nb of test inputs to attack'),
        (flags.DEFINE_float, 'learning_rate', 0.001,
         'Learning rate for training'),
    )
    for _define, _name, _default, _help in _flag_table:
        _define(_name, _default, _help)

    # Hand control to TensorFlow's app runner.
    tf.app.run()