mirror of
https://github.com/QData/TextAttack.git
synced 2021-10-13 00:05:06 +03:00
Add stanza support for part-of-speech constraint
This commit is contained in:
@@ -12,6 +12,7 @@ pandas>=1.0.1
|
||||
scikit-learn
|
||||
scipy==1.4.1
|
||||
sentence_transformers>0.2.6
|
||||
stanza
|
||||
torch
|
||||
transformers==3.3.0
|
||||
tensorflow>=2
|
||||
|
||||
63
tests/sample_outputs/run_attack_stanza_pos_tagger.txt
Normal file
63
tests/sample_outputs/run_attack_stanza_pos_tagger.txt
Normal file
@@ -0,0 +1,63 @@
|
||||
Attack(
|
||||
(search_method): GreedySearch
|
||||
(goal_function): UntargetedClassification
|
||||
(transformation): WordSwapEmbedding(
|
||||
(max_candidates): 15
|
||||
(embedding_type): paragramcf
|
||||
)
|
||||
(constraints):
|
||||
(0): PartOfSpeech(
|
||||
(tagger_type): stanza
|
||||
(tagset): universal
|
||||
(allow_verb_noun_swap): True
|
||||
(compare_against_original): True
|
||||
)
|
||||
(1): RepeatModification
|
||||
(2): StopwordModification
|
||||
(is_black_box): True
|
||||
)
|
||||
|
||||
--------------------------------------------- Result 1 ---------------------------------------------
|
||||
[92mPositive (91%)[0m --> [91mNegative (62%)[0m
|
||||
|
||||
lovingly photographed in the manner of a golden book sprung to [92mlife[0m , stuart little 2 manages sweetness largely without stickiness .
|
||||
|
||||
lovingly photographed in the manner of a golden book sprung to [91msubsistence[0m , stuart little 2 manages sweetness largely without stickiness .
|
||||
|
||||
|
||||
--------------------------------------------- Result 2 ---------------------------------------------
|
||||
[92mPositive (99%)[0m --> [91mNegative (58%)[0m
|
||||
|
||||
[92mconsistently[0m [92mclever[0m and suspenseful .
|
||||
|
||||
[91mpersistently[0m [91mbrainy[0m and suspenseful .
|
||||
|
||||
|
||||
--------------------------------------------- Result 3 ---------------------------------------------
|
||||
[91mNegative (78%)[0m --> [37m[SKIPPED][0m
|
||||
|
||||
it's like a " big chill " reunion of the baader-meinhof gang , only these guys are more harmless pranksters than political activists .
|
||||
|
||||
|
||||
--------------------------------------------- Result 4 ---------------------------------------------
|
||||
[92mPositive (96%)[0m --> [91mNegative (85%)[0m
|
||||
|
||||
the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with [92mtremendous[0m [92mskill[0m .
|
||||
|
||||
the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with [91mcolossal[0m [91mjurisdiction[0m .
|
||||
|
||||
|
||||
|
||||
+-------------------------------+--------+
|
||||
| Attack Results | |
|
||||
+-------------------------------+--------+
|
||||
| Number of successful attacks: | 3 |
|
||||
| Number of failed attacks: | 0 |
|
||||
| Number of skipped attacks: | 1 |
|
||||
| Original accuracy: | 75.0% |
|
||||
| Accuracy under attack: | 0.0% |
|
||||
| Attack success rate: | 100.0% |
|
||||
| Average perturbed word %: | 22.04% |
|
||||
| Average num. words per input: | 15.5 |
|
||||
| Avg num queries: | 175.67 |
|
||||
+-------------------------------+--------+
|
||||
@@ -135,6 +135,17 @@ attack_test_params = [
|
||||
),
|
||||
"tests/sample_outputs/kuleshov_cnn_sst_2.txt",
|
||||
),
|
||||
#
|
||||
# test: run_attack on LSTM MR using word embedding transformation and greedy search with Stanza part-of-speech tagger as a constraint
|
||||
#
|
||||
(
|
||||
"run_attack_stanza_pos_tagger",
|
||||
(
|
||||
"textattack attack --model lstm-mr --num-examples 4 --search-method greedy --transformation word-swap-embedding "
|
||||
"--constraints repeat stopword part-of-speech^tagger_type=\\'stanza\\' --shuffle=False"
|
||||
),
|
||||
"tests/sample_outputs/run_attack_stanza_pos_tagger.txt",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ from flair.data import Sentence
|
||||
from flair.models import SequenceTagger
|
||||
import lru
|
||||
import nltk
|
||||
import stanza
|
||||
|
||||
import textattack
|
||||
from textattack.constraints import Constraint
|
||||
@@ -46,10 +47,11 @@ class PartOfSpeech(Constraint):
|
||||
of `<https://arxiv.org/abs/1907.11932>`_ adapted from
|
||||
`<https://github.com/jind11/TextFooler>`_.
|
||||
|
||||
POS tagger from Flair `<https://github.com/flairNLP/flair>` also available
|
||||
POS taggers from Flair `<https://github.com/flairNLP/flair>`_ and
|
||||
Stanza `<https://github.com/stanfordnlp/stanza>`_ are also available
|
||||
|
||||
Args:
|
||||
tagger_type (str): Name of the tagger to use (available choices: "nltk", "flair").
|
||||
tagger_type (str): Name of the tagger to use (available choices: "nltk", "flair", "stanza").
|
||||
tagset (str): tagset to use for POS tagging
|
||||
allow_verb_noun_swap (bool): If `True`, allow verbs to be swapped with nouns and vice versa.
|
||||
compare_against_original (bool): If `True`, compare against the original text.
|
||||
@@ -75,6 +77,11 @@ class PartOfSpeech(Constraint):
|
||||
else:
|
||||
self._flair_pos_tagger = SequenceTagger.load("pos-fast")
|
||||
|
||||
if tagger_type == "stanza":
|
||||
self._stanza_pos_tagger = stanza.Pipeline(
|
||||
lang="en", processors="tokenize, pos", tokenize_pretokenized=True
|
||||
)
|
||||
|
||||
def clear_cache(self):
    """Drop every cached POS-tagging result so later lookups re-run the tagger."""
    cache = self._pos_tag_cache
    cache.clear()
|
||||
|
||||
@@ -101,6 +108,11 @@ class PartOfSpeech(Constraint):
|
||||
context_key_sentence
|
||||
)
|
||||
|
||||
if self.tagger_type == "stanza":
|
||||
word_list, pos_list = textattack.shared.utils.zip_stanza_result(
|
||||
self._stanza_pos_tagger(context_key), tagset=self.tagset
|
||||
)
|
||||
|
||||
self._pos_tag_cache[context_key] = (word_list, pos_list)
|
||||
|
||||
# idx of `word` in `context_words`
|
||||
|
||||
@@ -115,6 +115,10 @@ def _post_install():
|
||||
nltk.download("universal_tagset")
|
||||
nltk.download("wordnet")
|
||||
|
||||
import stanza
|
||||
|
||||
stanza.download("en")
|
||||
|
||||
|
||||
def set_cache_dir(cache_dir):
|
||||
"""Sets all relevant cache directories to ``TA_CACHE_DIR``."""
|
||||
|
||||
@@ -197,3 +197,26 @@ def zip_flair_result(pred, tag_type="pos-fast"):
|
||||
pos_list.append(token.get_tag("ner"))
|
||||
|
||||
return word_list, pos_list
|
||||
|
||||
|
||||
def zip_stanza_result(pred, tagset="universal"):
    """Flatten a tagged `stanza` ``Document`` into parallel word/POS lists.

    Walks every sentence of *pred* (not just the first) in document order,
    collecting each token's surface text and its part-of-speech tag:
    ``upos`` when ``tagset`` is ``"universal"``, otherwise the
    language-specific ``xpos``.

    Args:
        pred: Result from the Stanza POS tagger; must be a
            ``stanza.models.common.doc.Document``.
        tagset (str): Tag inventory to read from each word
            (``"universal"`` selects ``upos``; anything else selects ``xpos``).

    Returns:
        A ``(word_list, pos_list)`` pair of equal-length lists.

    Raises:
        TypeError: If *pred* is not a stanza ``Document``.
    """
    # Local import keeps stanza optional for users of other taggers.
    from stanza.models.common.doc import Document

    if not isinstance(pred, Document):
        raise TypeError("Result from Stanza POS tagger must be a `Document` object.")

    use_universal = tagset == "universal"
    words, tags = [], []
    for sent in pred.sentences:
        for tok in sent.words:
            words.append(tok.text)
            tags.append(tok.upos if use_universal else tok.xpos)

    return words, tags
|
||||
|
||||
Reference in New Issue
Block a user