Source code for robustcheck.EpsilonGreedyUntargeted.EpsilonGreedyUntargeted
import numpy as np
import random
from robustcheck.types.UntargetedAttack import UntargetedAttack
class EpsilonGreedyUntargeted(UntargetedAttack):
""" Black-box, untargeted adversarial attack against image classifiers.
It encapsulates the target model and image and provides a method to run the adversarial attack. The attack
samples groups of pixels to adversarially perturb according to a classic epsilon-greedy strategy. The reward
is represented by the decrease in the probability to be classified correctly of the target image by the target
model. The attack samples a pixel from the group that provided the highest average reward so far with
probability 1-epsilon, and a pixel from a random group with probability epsilon.
Attributes:
model: Target model to be attacked. This has to expose a predict method that returns the
output probability distributions when provided a batch of images as input.
img: An array (HxWxC) representing the target image to be perturbed.
label: An integer representing the correct class index of the image.
pixel_groups: An array of arrays of pairs of integers. Each second level array represents the indices of pixels
that get attacked as part of the same pixel group. Usual approaches are to have these groups created based
on objectness or on spatial proximity (e.g. in a grid-like setup).
epsilon: A float representing the probability of exploration (choosing a random group of pixels to be perturbed)
in the classic epsilon-greedy strategy.
pixel_space_max: A number (integer or float) representing the maximum value pixels can take in the image space.
This is used for extracting normalised metrics about the attack success later on.
verbose: A boolean flag which, when set to True, enables printing info on the attack results.
Methods:
get_best_candidate(self): Returns the fittest individual in the active generation.
is_perturbed(self): Returns a boolean representing whether a successful adversarial perturbation has been
achieved in the active generation.
run_adversarial_attack(self, steps=100): Runs the adversarial attack based on the evolutionary strategy until a
successful adversarial perturbation was found or until steps generations were explored. Returns the total
number of generations before the stopping condition was reached.
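
    Example:
        A minimal usage sketch. Here ``model`` stands for any classifier exposing a ``predict`` method
        over batches of HxWxC images, and the 8x8 grid grouping of a 32x32 image is just one
        illustrative way to build ``pixel_groups``::

            groups = [
                [(i, j) for i in range(bi * 8, (bi + 1) * 8) for j in range(bj * 8, (bj + 1) * 8)]
                for bi in range(4)
                for bj in range(4)
            ]
            attack = EpsilonGreedyUntargeted(model, img, label, pixel_groups=groups, epsilon=0.1)
            steps_used = attack.run_adversarial_attack()
            adversarial_img = attack.get_best_candidate()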
"""
def __init__(
self,
model,
img,
label,
pixel_groups,
epsilon=0.1,
pixel_space_int_flag=True,
pixel_space_min=0,
pixel_space_max=255,
steps=1000,
verbose=False
):
UntargetedAttack.__init__(self, model, img, label) # Each instance encapsulates the model and image to perturb
self._perturbed_img = np.copy(img) # self._perturbed_img is the variable we will iteratively perturb
self._model_perturbed_prediction = self.model.predict(np.array([self._perturbed_img]))[0]
self.queries = 1
self.pixel_groups = pixel_groups
self.number_groups = len(self.pixel_groups)
assert 0 <= epsilon <= 1 # epsilon is a probability (of exploration), needs to be between 0 and 1
self.epsilon = epsilon
self.pixel_space_int_flag = pixel_space_int_flag
self.pixel_space_min = pixel_space_min
self.pixel_space_max = pixel_space_max
self.steps = steps
self.verbose = verbose
# self._values contains the historical average rewards per pixel group
self._values = [0.0 for _ in range(self.number_groups)]
# self._counts contains the historical count of explorations per pixel group
self._counts = [0 for _ in range(self.number_groups)]
def select_group(self):
"""
This is the core method that trades off between exploration and exploitation, as expected in classic
epsilon-greedy strategies. Here, exploration is represented by sampling a random group of pixels, while
exploitation means selecting a group of pixels with the highest average reward observed so far.
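
        Example:
            An illustrative sketch, assuming an initialised instance ``attack`` with ``epsilon=0.1``:
            roughly 90% of calls return a group whose historical average reward is maximal (ties
            broken uniformly at random), and roughly 10% return a uniformly random group index::

                group_index = attack.select_group()  # integer index into attack.pixel_groups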
"""
        if random.random() > self.epsilon:
            # Pick a pixel group with the highest historical reward with probability 1 - self.epsilon.
            values = np.asarray(self._values)
            max_reward_indices = np.flatnonzero(values == values.max())
            return np.random.choice(max_reward_indices)
        else:
            # Pick a random pixel group with probability self.epsilon.
            return random.randrange(self.number_groups)
def update(self, chosen_group, reward):
"""
        Updates the value estimate of pixel group chosen_group given a newly observed reward, refreshing the
        corresponding running average in the instance field _values and the observation count in _counts.
Args:
chosen_group: An integer representing the index of the pixel group that will get updated after a new reward
was observed.
reward: The reward used to update the pixel group value and count.
Returns:
A float representing the updated value of the historical average reward of the chosen group.
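
        Example:
            A worked instance of the incremental-mean update, assuming group 3 has been observed once
            (n - 1 = 1) with average value 0.2: a new reward of 0.4 gives
            (1/2) * 0.2 + (1/2) * 0.4 = 0.3::

                new_avg = attack.update(chosen_group=3, reward=0.4)  # 0.3 under the state above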
"""
self._counts[chosen_group] = self._counts[chosen_group] + 1
n = self._counts[chosen_group]
value = self._values[chosen_group]
new_value = ((n - 1) / float(n)) * value + (1 / float(n)) * reward
self._values[chosen_group] = new_value
return new_value
def explore_attack_group(self, group_index):
"""
Explores the potential reward obtained by sampling the attacked pixel from a fixed group.
Args:
group_index: An integer representing the index of the pixel group that the method will attempt perturbing.
Returns:
            A dictionary containing information about the perturbation attempt. The dictionary contains the
            following fields:
                "potential_reward": A float representing the reward observed by perturbing a randomly sampled
                    pixel of the target group.
                "altered_image": A three-dimensional array representing the perturbed image after applying the
                    group_index group perturbation.
                "prob_before": A float representing the probability that the target model assigns to the
                    correct class before the group_index group perturbation is applied.
                "prob_after": A float representing the probability that the target model assigns to the
                    correct class after the group_index group perturbation is applied.
                "pred_after": An array of floats representing the full probability distribution output by the
                    target model after the group_index group perturbation is applied.
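
        Example:
            An illustrative call, assuming an initialised instance ``attack``; a positive potential
            reward means the sampled pixel perturbation lowered the correct-class probability (one
            model query is consumed per call)::

                result = attack.explore_attack_group(group_index=0)
                if result["potential_reward"] > 0:
                    print(result["prob_before"], "->", result["prob_after"])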
"""
attack_group = self.pixel_groups[group_index]
count_pixels_group = len(attack_group)
attack_pixel_index = random.randrange(count_pixels_group)
attack_pixel = attack_group[attack_pixel_index]
candidate_next_perturbed_img = self._perturbed_img.copy()
for ch in range(np.shape(self.img)[2]):
value = (
random.randint(int(self.pixel_space_min), int(self.pixel_space_max))
if self.pixel_space_int_flag
else random.uniform(self.pixel_space_min, self.pixel_space_max)
)
            candidate_next_perturbed_img[attack_pixel[0], attack_pixel[1], ch] = value
correct_class_prob_before = self._model_perturbed_prediction[self.label]
pred_after = self.model.predict(np.array([candidate_next_perturbed_img]))[0]
self.queries += 1
correct_class_prob_after = pred_after[self.label]
potential_reward = correct_class_prob_before - correct_class_prob_after
return {
"potential_reward": potential_reward,
"altered_image": candidate_next_perturbed_img,
"prob_before": correct_class_prob_before,
"prob_after": correct_class_prob_after,
"pred_after": pred_after
}
def run_adversarial_attack(self):
"""
Runs the adversarial attack.
Returns:
An integer representing the number of attack steps until either the attack was successful or the maximum
steps threshold was reached.
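
        Example:
            An illustrative run, assuming an initialised instance ``attack``::

                steps_used = attack.run_adversarial_attack()
                success = attack.is_perturbed()  # True if the label flipped within self.steps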
"""
trial_index = 0
while trial_index < self.steps and not self.is_perturbed():
attack_group = self.select_group() # Select the target pixel group to perturb
# Simulate attacking the target pixel group and retrieve the potential reward
attack_result = self.explore_attack_group(attack_group)
potential_reward = attack_result["potential_reward"]
altered_image = attack_result["altered_image"]
# Only update the perturbed image self._perturbed_img if the potential reward is positive
if potential_reward > 0:
self._perturbed_img = altered_image
self._model_perturbed_prediction = attack_result["pred_after"]
            # Update the average historical reward of the target pixel group regardless of whether the reward was positive
self.update(attack_group, potential_reward)
trial_index += 1
        if self.verbose:
            if self.is_perturbed():
                print(f"Image successfully perturbed in {trial_index} rounds")
            else:
                print(f"The attack did not succeed within {trial_index} rounds")
            print("Correct label:", self.label)
            print("Predicted label:", np.argmax(self._model_perturbed_prediction))
return trial_index
    def get_best_candidate(self):
        """Returns the current perturbed image (adversarial if is_perturbed() returns True)."""
        return self._perturbed_img
def is_perturbed(self):
"""
Returns:
A boolean representing whether the adversarial attack has been successful
"""
pred_label = np.argmax(self._model_perturbed_prediction)
correct_output = (pred_label == self.label)
return not correct_output
# TODO: move to /attacks module together with EvoStrategyUniformUntargeted, keep only RobustnessCheck in main folder
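

# A minimal end-to-end sketch against a stub model, kept under a main guard. The
# RandomModel class below is a hypothetical stand-in (not part of robustcheck) that
# merely satisfies the predict-over-a-batch contract; substitute a real classifier
# in practice.
if __name__ == "__main__":
    class RandomModel:
        """Stub classifier returning a random probability distribution per image."""

        def __init__(self, num_classes=10):
            self.num_classes = num_classes

        def predict(self, batch):
            logits = np.random.rand(len(batch), self.num_classes)
            return logits / logits.sum(axis=1, keepdims=True)

    demo_img = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)
    demo_groups = [
        [(i, j) for i in range(bi * 8, (bi + 1) * 8) for j in range(bj * 8, (bj + 1) * 8)]
        for bi in range(4)
        for bj in range(4)
    ]
    demo_attack = EpsilonGreedyUntargeted(
        RandomModel(), demo_img, label=0, pixel_groups=demo_groups, verbose=True
    )
    demo_attack.run_adversarial_attack()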