1
0
mirror of https://github.com/QData/TextAttack.git synced 2021-10-13 00:05:06 +03:00

Add stanza support for part-of-speech constraint

This commit is contained in:
k-ivey
2020-10-01 19:55:11 -04:00
parent 8c575f2006
commit f079faa61e
6 changed files with 116 additions and 2 deletions

View File

@@ -12,6 +12,7 @@ pandas>=1.0.1
scikit-learn
scipy==1.4.1
sentence_transformers>0.2.6
stanza
torch
transformers==3.3.0
tensorflow>=2

View File

@@ -0,0 +1,63 @@
Attack(
(search_method): GreedySearch
(goal_function): UntargetedClassification
(transformation): WordSwapEmbedding(
(max_candidates): 15
(embedding_type): paragramcf
)
(constraints):
(0): PartOfSpeech(
(tagger_type): stanza
(tagset): universal
(allow_verb_noun_swap): True
(compare_against_original): True
)
(1): RepeatModification
(2): StopwordModification
(is_black_box): True
)
--------------------------------------------- Result 1 ---------------------------------------------
Positive (91%) --> Negative (62%)
lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .
lovingly photographed in the manner of a golden book sprung to subsistence , stuart little 2 manages sweetness largely without stickiness .
--------------------------------------------- Result 2 ---------------------------------------------
Positive (99%) --> Negative (58%)
consistently clever and suspenseful .
persistently brainy and suspenseful .
--------------------------------------------- Result 3 ---------------------------------------------
Negative (78%) --> [SKIPPED]
it's like a " big chill " reunion of the baader-meinhof gang , only these guys are more harmless pranksters than political activists .
--------------------------------------------- Result 4 ---------------------------------------------
Positive (96%) --> Negative (85%)
the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with tremendous skill .
the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with colossal jurisdiction .
+-------------------------------+--------+
| Attack Results | |
+-------------------------------+--------+
| Number of successful attacks: | 3 |
| Number of failed attacks: | 0 |
| Number of skipped attacks: | 1 |
| Original accuracy: | 75.0% |
| Accuracy under attack: | 0.0% |
| Attack success rate: | 100.0% |
| Average perturbed word %: | 22.04% |
| Average num. words per input: | 15.5 |
| Avg num queries: | 175.67 |
+-------------------------------+--------+

View File

@@ -135,6 +135,17 @@ attack_test_params = [
),
"tests/sample_outputs/kuleshov_cnn_sst_2.txt",
),
#
# test: run_attack on LSTM MR using word embedding transformation and greedy search with Stanza part-of-speech tagger as a constraint
#
(
"run_attack_stanza_pos_tagger",
(
"textattack attack --model lstm-mr --num-examples 4 --search-method greedy --transformation word-swap-embedding "
"--constraints repeat stopword part-of-speech^tagger_type=\\'stanza\\' --shuffle=False"
),
"tests/sample_outputs/run_attack_stanza_pos_tagger.txt",
),
]

View File

@@ -3,6 +3,7 @@ from flair.data import Sentence
from flair.models import SequenceTagger
import lru
import nltk
import stanza
import textattack
from textattack.constraints import Constraint
@@ -46,10 +47,11 @@ class PartOfSpeech(Constraint):
of `<https://arxiv.org/abs/1907.11932>`_ adapted from
`<https://github.com/jind11/TextFooler>`_.
POS tagger from Flair `<https://github.com/flairNLP/flair>` also available
POS taggers from Flair `<https://github.com/flairNLP/flair>`_ and
Stanza `<https://github.com/stanfordnlp/stanza>`_ are also available
Args:
tagger_type (str): Name of the tagger to use (available choices: "nltk", "flair").
tagger_type (str): Name of the tagger to use (available choices: "nltk", "flair", "stanza").
tagset (str): tagset to use for POS tagging
allow_verb_noun_swap (bool): If `True`, allow verbs to be swapped with nouns and vice versa.
compare_against_original (bool): If `True`, compare against the original text.
@@ -75,6 +77,11 @@ class PartOfSpeech(Constraint):
else:
self._flair_pos_tagger = SequenceTagger.load("pos-fast")
if tagger_type == "stanza":
self._stanza_pos_tagger = stanza.Pipeline(
lang="en", processors="tokenize, pos", tokenize_pretokenized=True
)
def clear_cache(self):
self._pos_tag_cache.clear()
@@ -101,6 +108,11 @@ class PartOfSpeech(Constraint):
context_key_sentence
)
if self.tagger_type == "stanza":
word_list, pos_list = textattack.shared.utils.zip_stanza_result(
self._stanza_pos_tagger(context_key), tagset=self.tagset
)
self._pos_tag_cache[context_key] = (word_list, pos_list)
# idx of `word` in `context_words`

View File

@@ -115,6 +115,10 @@ def _post_install():
nltk.download("universal_tagset")
nltk.download("wordnet")
import stanza
stanza.download("en")
def set_cache_dir(cache_dir):
"""Sets all relevant cache directories to ``TA_CACHE_DIR``."""

View File

@@ -197,3 +197,26 @@ def zip_flair_result(pred, tag_type="pos-fast"):
pos_list.append(token.get_tag("ner"))
return word_list, pos_list
def zip_stanza_result(pred, tagset="universal"):
    """Flatten a `stanza` document into parallel word and part-of-speech lists.

    Words from *every* sentence in ``pred`` are collected in document order,
    so the returned lists cover the whole document, not only its first
    sentence.

    Args:
        pred: The ``stanza.models.common.doc.Document`` produced by a Stanza
            pipeline.
        tagset (str): ``"universal"`` selects each word's ``upos`` attribute;
            any other value selects its ``xpos`` attribute.

    Returns:
        tuple: ``(word_list, pos_list)`` -- two lists of equal length holding
        each word's ``text`` and the tag chosen by ``tagset``.

    Raises:
        TypeError: If ``pred`` is not a Stanza ``Document``.
    """
    # Imported inside the function so that `stanza` is only needed when this
    # helper is actually called.
    from stanza.models.common.doc import Document

    if not isinstance(pred, Document):
        raise TypeError("Result from Stanza POS tagger must be a `Document` object.")

    # The attribute to read depends only on `tagset`, so pick it once instead
    # of re-checking for every word.
    tag_attr = "upos" if tagset == "universal" else "xpos"

    word_list = []
    pos_list = []
    for sentence in pred.sentences:
        for word in sentence.words:
            word_list.append(word.text)
            pos_list.append(getattr(word, tag_attr))

    return word_list, pos_list