1
0
mirror of https://github.com/QData/TextAttack.git synced 2021-10-13 00:05:06 +03:00
Files
textattack-nlp-transformer/textattack/search_methods/PSO_algorithm.py

275 lines
10 KiB
Python

"""
Reimplementation of the search method from 'Word-level Textual Adversarial Attacking as Combinatorial Optimization'
by Zang et al.
`<https://www.aclweb.org/anthology/2020.acl-main.540.pdf>`_
`<https://github.com/thunlp/SememePSO-Attack>`_
"""
from copy import deepcopy
import numpy as np
from textattack.goal_function_results import GoalFunctionResultStatus
from textattack.search_methods import SearchMethod
class PSOAlgorithm(SearchMethod):
    """
    Attacks a model with word substitutions using a Particle Swarm Optimization (PSO) algorithm.

    Some key hyper-parameters are setup according to the original paper:

    "We adjust PSO on the validation set of SST and set ω_1 as 0.8 and ω_2 as 0.2.
    We set the max velocity of the particles V_{max} to 3, which means the changing
    probability of the particles ranges from 0.047 (sigmoid(-3)) to 0.953 (sigmoid(3))."

    Args:
        pop_size (:obj:`int`, optional): The population size. Defaults to 60.
        max_iters (:obj:`int`, optional): The maximum number of iterations to use. Defaults to 20.
    """

    def __init__(
        self, pop_size=60, max_iters=20,
    ):
        self.max_iters = max_iters
        self.pop_size = pop_size
        # Overwritten with the second value returned by every
        # ``get_goal_results`` call; once true, the search returns immediately.
        self.search_over = False

        # Inertia weight: starts at Omega_1 and decays linearly towards Omega_2.
        self.Omega_1 = 0.8
        self.Omega_2 = 0.2
        # Probability of turning towards the personal elite starts at C1_origin
        # and moves towards C2_origin; the global-elite probability does the
        # opposite.
        self.C1_origin = 0.8
        self.C2_origin = 0.2
        # Magnitude of the per-word velocity contribution (see ``_equal``).
        self.V_max = 3.0

    def _generate_population(self, x_orig, neighbors_list, neighbors_len):
        """Create ``pop_size`` particles, each a single-word mutation of ``x_orig``."""
        h_score, w_list = self._gen_h_score(x_orig, neighbors_len, neighbors_list)
        return [self._mutate(x_orig, h_score, w_list) for _ in range(self.pop_size)]

    def _mutate(self, x_cur, w_select_probs, w_list):
        """Replace one word of ``x_cur``.

        The position is sampled according to ``w_select_probs`` and the word
        at that position is replaced by ``w_list[position]``.
        """
        rand_idx = np.random.choice(len(w_select_probs), 1, p=w_select_probs)[0]
        return x_cur.replace_word_at_index(rand_idx, w_list[rand_idx])

    def _gen_h_score(self, x, neighbors_len, neighbors_list):
        """Compute, for every word position of ``x``, a selection probability
        and the best replacement word.

        Positions without candidates keep their word and get probability
        mass 0 (before normalisation).

        Returns:
            A tuple ``(h_score, w_list)``: a numpy array of normalised
            probabilities and the list of chosen replacement words.
        """
        w_list = []
        prob_list = []
        for i, orig_w in enumerate(x.words):
            if neighbors_len[i] == 0:
                w_list.append(orig_w)
                prob_list.append(0)
                continue
            p, w = self._gen_most_change(x, i, neighbors_list[i])
            w_list.append(w)
            prob_list.append(p)

        h_score = np.array(self._norm(prob_list))
        return h_score, w_list

    def _norm(self, n):
        """Clip non-positive entries of ``n`` to 0 and scale the rest to sum to 1.

        If nothing positive remains, a uniform distribution is returned.
        """
        clipped = [0 if value <= 0 else value for value in n]
        total = np.sum(clipped)
        if total == 0:
            return [1 / len(clipped) for _ in clipped]
        return [value / total for value in clipped]

    # for un-targeted attacking
    def _gen_most_change(self, x_cur, pos, replace_list):
        """Find the candidate for position ``pos`` that raises the score most.

        Returns:
            A tuple ``(gain, word)`` where ``gain`` is the largest score
            difference relative to ``x_cur`` and ``word`` the replacement
            achieving it. ``(0, current word)`` is returned when the search is
            over or no candidate was scored.
        """
        orig_result, self.search_over = self.get_goal_results([x_cur])
        if self.search_over:
            return 0, x_cur.words[pos]

        new_x_list = [x_cur.replace_word_at_index(pos, w) for w in replace_list]
        new_x_results, self.search_over = self.get_goal_results(new_x_list)
        # Score gain of every scored candidate over the unmodified text.
        gains = np.array([r.score for r in new_x_results]) - orig_result[0].score
        if gains.size == 0:
            return 0, x_cur.words[pos]

        best = np.argsort(gains)[-1]
        return gains[best], new_x_list[best].words[pos]

    def _get_neighbors_list(self, attacked_text):
        """
        Collects the substitution candidates for every word position.

        Args:
            attacked_text: The original text
        Returns:
            A list with one numpy array per word of ``attacked_text``; each
            array holds the candidate replacement words for that position.
        """
        words = attacked_text.words
        neighbors_list = [[] for _ in range(len(words))]
        transformations = self.get_transformations(
            attacked_text, original_text=self.original_attacked_text
        )
        for transformed_text in transformations:
            try:
                diff_idx = attacked_text.first_word_diff_index(transformed_text)
                neighbors_list[diff_idx].append(transformed_text.words[diff_idx])
            except Exception:
                # Only a transformation whose words are identical to the
                # input is tolerated here; anything else trips the asserts.
                assert len(attacked_text.words) == len(transformed_text.words)
                assert all(
                    w1 == w2
                    for w1, w2 in zip(attacked_text.words, transformed_text.words)
                )
        return [np.array(candidates) for candidates in neighbors_list]

    def _equal(self, a, b):
        """Return ``-V_max`` when ``a == b`` and ``+V_max`` otherwise."""
        return -self.V_max if a == b else self.V_max

    def _turn(self, x1, x2, prob, x_len):
        """Return ``x1`` with the word at each index ``i`` replaced by
        ``x2``'s word with probability ``prob[i]``.

        NOTE(review): the result starts from ``x1`` and copies words from
        ``x2``; confirm this direction matches the reference implementation.
        """
        x2_words = x2.words
        # One uniform draw per position, in index order.
        chosen = [i for i in range(x_len) if np.random.uniform() < prob[i]]
        return x1.replace_words_at_indices(chosen, [x2_words[i] for i in chosen])

    def _count_change_ratio(self, x1, x2, x_len):
        """Return the fraction of word positions at which ``x1`` and ``x2`` differ.

        Words are compared position by position; comparing the two word
        sequences with a single ``!=`` would collapse to one boolean for
        Python lists.
        """
        if x_len == 0:
            return 0.0
        num_changed = sum(1 for w1, w2 in zip(x1.words, x2.words) if w1 != w2)
        return float(num_changed) / float(x_len)

    def _sigmoid(self, n):
        """Logistic function ``1 / (1 + exp(-n))``."""
        return 1 / (1 + np.exp(-n))

    def _evaluate_population(self, pop, fallback):
        """Score ``pop`` and check whether the search can stop.

        Returns:
            A tuple ``(final, scores, top)``. ``final`` is the result to return
            from the search (highest-scoring result when the search is over,
            or the top result when it succeeded), else ``None``. ``scores`` is
            a numpy array of scores and ``top`` the index of the best one;
            both are ``None`` when the search is over. ``fallback`` is used
            when the search is over and no result came back.
        """
        results, self.search_over = self.get_goal_results(pop)
        if self.search_over:
            return max(results, key=lambda r: r.score, default=fallback), None, None
        scores = np.array([r.score for r in results])
        top = np.argmax(scores)
        if results[top].goal_status == GoalFunctionResultStatus.SUCCEEDED:
            return results[top], scores, top
        return None, scores, top

    def _perform_search(self, initial_result):
        """Run the PSO search starting from ``initial_result``.

        Returns:
            The first successful result found, the highest-scoring result of
            the last evaluated population if the search is cut short, or
            ``initial_result`` if nothing succeeded within ``max_iters``
            iterations or there is nothing to substitute.
        """
        self.original_attacked_text = initial_result.attacked_text
        x_len = len(self.original_attacked_text.words)
        self.correct_output = initial_result.output

        # get word substitute candidates and generate population
        neighbors_list = self._get_neighbors_list(self.original_attacked_text)
        neighbors_len = [len(x) for x in neighbors_list]
        if not any(neighbors_len):
            # Empty text or no candidate for any word: nothing can be changed.
            return initial_result
        pop = self._generate_population(
            self.original_attacked_text, neighbors_list, neighbors_len
        )

        # test population against target model
        final, pop_scores, top_attack = self._evaluate_population(pop, initial_result)
        if final is not None:
            return final

        # Personal bests per particle and the global best. Particles are only
        # ever rebound, never mutated in place, so a shallow copy suffices.
        part_elites = list(pop)
        part_elites_scores = pop_scores.copy()
        all_elite = pop[top_attack]
        all_elite_score = pop_scores[top_attack]

        # One random initial velocity per particle, shared by all its words.
        V = np.random.uniform(-self.V_max, self.V_max, self.pop_size)
        V_P = [[V[t] for _ in range(x_len)] for t in range(self.pop_size)]

        # start iterations
        for i in range(self.max_iters):
            Omega = (self.Omega_1 - self.Omega_2) * (
                self.max_iters - i
            ) / self.max_iters + self.Omega_2
            C1 = self.C1_origin - i / self.max_iters * (self.C1_origin - self.C2_origin)
            C2 = self.C2_origin + i / self.max_iters * (self.C1_origin - self.C2_origin)

            all_elite_words = all_elite.words
            for idx in range(self.pop_size):
                # calculate the probability of turning each word
                # (assumes every particle has x_len words - TODO confirm)
                pop_words = pop[idx].words
                part_elites_words = part_elites[idx].words
                for dim in range(x_len):
                    V_P[idx][dim] = Omega * V_P[idx][dim] + (1 - Omega) * (
                        self._equal(pop_words[dim], part_elites_words[dim])
                        + self._equal(pop_words[dim], all_elite_words[dim])
                    )
                turn_prob = [self._sigmoid(v) for v in V_P[idx]]

                if np.random.uniform() < C1:
                    pop[idx] = self._turn(part_elites[idx], pop[idx], turn_prob, x_len)
                if np.random.uniform() < C2:
                    pop[idx] = self._turn(all_elite, pop[idx], turn_prob, x_len)

            # check if there is any successful attack in the current population
            final, pop_scores, top_attack = self._evaluate_population(
                pop, initial_result
            )
            if final is not None:
                return final

            # mutation based on the current change rate
            new_pop = []
            for x in pop:
                change_ratio = self._count_change_ratio(
                    x, self.original_attacked_text, x_len
                )
                p_change = (
                    1 - 2 * change_ratio
                )  # referred from the original source code
                if np.random.uniform() < p_change:
                    new_h, new_w_list = self._gen_h_score(
                        x, neighbors_len, neighbors_list
                    )
                    new_pop.append(self._mutate(x, new_h, new_w_list))
                else:
                    new_pop.append(x)
            pop = new_pop

            # check if there is any successful attack in the current population
            final, pop_scores, top_attack = self._evaluate_population(
                pop, initial_result
            )
            if final is not None:
                return final

            # update the elite if the score is increased
            for k in range(self.pop_size):
                if pop_scores[k] > part_elites_scores[k]:
                    part_elites[k] = pop[k]
                    part_elites_scores[k] = pop_scores[k]
            if pop_scores[top_attack] > all_elite_score:
                all_elite = pop[top_attack]
                all_elite_score = pop_scores[top_attack]

        return initial_result