1
0
mirror of https://github.com/QData/TextAttack.git synced 2021-10-13 00:05:06 +03:00

Add gradient-based white-box search

This commit is contained in:
Jin Yong Yoo
2020-10-05 18:46:47 -04:00
parent 6273b19c19
commit bdbeae80d2
7 changed files with 71 additions and 8 deletions

View File

@@ -49,5 +49,9 @@ class BeamSearch(SearchMethod):
beam = [potential_next_beam[i] for i in best_indices]
return best_result
@property
def is_blackbox(self):
    """Beam search only needs the goal function's scores on candidate
    texts — it never inspects the victim model's internals — so it is a
    black-box search method."""
    return True
def extra_repr_keys(self):
    """Attribute names to include in this search method's ``__repr__``."""
    keys = ["beam_width"]
    return keys

View File

@@ -285,6 +285,10 @@ class GeneticAlgorithm(PopulationBasedSearch, ABC):
substitutions."""
return transformation_consists_of_word_swaps(transformation)
@property
def is_blackbox(self):
    """Genetic search queries only the goal function's output scores, not
    the victim model's internal state, so it is a black-box method."""
    return True
def extra_repr_keys(self):
return [
"pop_size",

View File

@@ -28,6 +28,7 @@ class GreedyWordSwapWIR(SearchMethod):
Args:
wir_method: method for ranking most important words
model_wrapper: model wrapper used for gradient-based ranking
"""
def __init__(self, wir_method="unk"):
@@ -44,6 +45,7 @@ class GreedyWordSwapWIR(SearchMethod):
]
leave_one_results, search_over = self.get_goal_results(leave_one_texts)
index_scores = np.array([result.score for result in leave_one_results])
elif self.wir_method == "weighted-saliency":
# first, compute word saliency
leave_one_texts = [
@@ -74,12 +76,47 @@ class GreedyWordSwapWIR(SearchMethod):
delta_ps.append(max_score_change)
index_scores = softmax_saliency_scores * np.array(delta_ps)
elif self.wir_method == "delete":
leave_one_texts = [
initial_text.delete_word_at_index(i) for i in range(len_text)
]
leave_one_results, search_over = self.get_goal_results(leave_one_texts)
index_scores = np.array([result.score for result in leave_one_results])
elif self.wir_method == "gradient":
victim_model = self.get_model()
index_scores = np.zeros(initial_text.num_words)
grad_output = victim_model.get_grad(initial_text.tokenizer_input)
gradient = grad_output["gradient"]
j = 0
last_matched = 0
for i, word in enumerate(initial_text.words):
word = initial_text.words[i].lower()
matched_tokens = []
a = []
while j < len(grad_output["tokens"]) and len(word) > 0:
token = grad_output["tokens"][j].lower()
# remove "##" if it's a subword
token = token.replace("##", "")
idx = word.find(token)
if idx == 0:
word = word[idx + len(token) :]
matched_tokens.append(j)
a.append(token)
last_matched = j
j += 1
if not matched_tokens:
# Reset j to most recent match
j = last_matched
index_scores[i] = 0.0
else:
agg_grad = np.mean(gradient[matched_tokens], axis=0)
index_scores[i] = np.linalg.norm(agg_grad, ord=1)
search_over = False
elif self.wir_method == "random":
index_order = np.arange(len_text)
np.random.shuffle(index_order)
@@ -146,5 +183,12 @@ class GreedyWordSwapWIR(SearchMethod):
limited to word swap and deletion transformations."""
return transformation_consists_of_word_swaps_and_deletions(transformation)
@property
def is_blackbox(self):
    """Whether this search only needs model outputs.

    Every word-importance-ranking method except ``"gradient"`` works from
    goal-function scores alone; the gradient ranking reads the victim
    model's gradients and is therefore white-box.
    """
    return self.wir_method != "gradient"
def extra_repr_keys(self):
    """Attribute names to include in this search method's ``__repr__``."""
    keys = ["wir_method"]
    return keys

View File

@@ -45,6 +45,7 @@ class ParticleSwarmOptimization(PopulationBasedSearch):
self.pop_size = pop_size
self.post_turn_check = post_turn_check
self.max_turn_retries = 20
self.is_blackbox = True
self._search_over = False
self.omega_1 = 0.8
@@ -329,6 +330,10 @@ class ParticleSwarmOptimization(PopulationBasedSearch):
substitutions."""
return transformation_consists_of_word_swaps(transformation)
@property
def is_blackbox(self):
    """PSO queries only the goal function's output scores, so it is a
    black-box search method."""
    # NOTE(review): __init__ in this same commit also assigns
    # ``self.is_blackbox = True``. With this setter-less property on the
    # class, that instance-attribute assignment raises AttributeError at
    # construction time — the line in __init__ should be removed.
    return True
def extra_repr_keys(self):
    """Attribute names to include in this search method's ``__repr__``."""
    keys = ["pop_size", "max_iters", "post_turn_check", "max_turn_retries"]
    return keys

View File

@@ -32,6 +32,12 @@ class SearchMethod(ABC):
raise AttributeError(
"Search Method must have access to filter_transformations method"
)
if not self.is_blackbox and not hasattr(self, "get_model"):
raise AttributeError(
"Search Method must have access to get_model method if it is a white-box method"
)
return self._perform_search(initial_result)
@abstractmethod
@@ -48,6 +54,12 @@ class SearchMethod(ABC):
``transformation``."""
return True
@property
def is_blackbox(self):
    """``True`` when the search needs only the victim model's outputs
    (black-box); ``False`` when it reads internal state such as gradients
    (white-box). Every concrete ``SearchMethod`` must override this.
    """
    raise NotImplementedError()
def extra_repr_keys(self):
    """Attribute names to include in ``__repr__``; the base class has none."""
    return list()

View File

@@ -114,6 +114,8 @@ class Attack:
)
)
self.search_method.filter_transformations = self.filter_transformations
if not search_method.is_blackbox:
self.search_method.get_model = lambda: self.goal_function.model
def clear_cache(self, recursive=True):
self.constraints_cache.clear()

View File

@@ -100,14 +100,6 @@ def load_textattack_model_from_path(model_name, model_path):
model = textattack.models.helpers.WordCNNForClassification(
model_path=model_path, num_labels=num_labels
)
elif model_name.startswith("bert"):
model_path, num_labels = model_path
textattack.shared.logger.info(
f"Loading pre-trained TextAttack BERT model: {colored_model_name}"
)
model = textattack.models.helpers.BERTForClassification(
model_path=model_path, num_labels=num_labels
)
elif model_name.startswith("t5"):
model = textattack.models.helpers.T5ForTextToText(model_path)
else: