Add stanza support for part-of-speech constraint

2021-10-13 00:05:06 +03:00 · 2020-10-01 19:55:11 -04:00
parent 8c575f2006
commit f079faa61e
6 changed files with 116 additions and 2 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,7 @@ pandas>=1.0.1
 scikit-learn
 scipy==1.4.1
 sentence_transformers>0.2.6
+stanza
 torch
 transformers==3.3.0
 tensorflow>=2
--- a/tests/sample_outputs/run_attack_stanza_pos_tagger.txt
+++ b/tests/sample_outputs/run_attack_stanza_pos_tagger.txt
@@ -0,0 +1,63 @@
+Attack(
+  (search_method): GreedySearch
+  (goal_function):  UntargetedClassification
+  (transformation):  WordSwapEmbedding(
+    (max_candidates):  15
+    (embedding_type):  paragramcf
+  )
+  (constraints): 
+    (0): PartOfSpeech(
+        (tagger_type):  stanza
+        (tagset):  universal
+        (allow_verb_noun_swap):  True
+        (compare_against_original):  True
+      )
+    (1): RepeatModification
+    (2): StopwordModification
+  (is_black_box):  True
+) 
+
+--------------------------------------------- Result 1 ---------------------------------------------
+[92mPositive (91%)[0m --> [91mNegative (62%)[0m
+
+lovingly photographed in the manner of a golden book sprung to [92mlife[0m , stuart little 2 manages sweetness largely without stickiness .
+
+lovingly photographed in the manner of a golden book sprung to [91msubsistence[0m , stuart little 2 manages sweetness largely without stickiness .
+
+
+--------------------------------------------- Result 2 ---------------------------------------------
+[92mPositive (99%)[0m --> [91mNegative (58%)[0m
+
+[92mconsistently[0m [92mclever[0m and suspenseful .
+
+[91mpersistently[0m [91mbrainy[0m and suspenseful .
+
+
+--------------------------------------------- Result 3 ---------------------------------------------
+[91mNegative (78%)[0m --> [37m[SKIPPED][0m
+
+it's like a " big chill " reunion of the baader-meinhof gang , only these guys are more harmless pranksters than political activists .
+
+
+--------------------------------------------- Result 4 ---------------------------------------------
+[92mPositive (96%)[0m --> [91mNegative (85%)[0m
+
+the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with [92mtremendous[0m [92mskill[0m .
+
+the story gives ample opportunity for large-scale action and suspense , which director shekhar kapur supplies with [91mcolossal[0m [91mjurisdiction[0m .
+
+
+
+-------------------------------+--------+
+| Attack Results                |        |
+-------------------------------+--------+
+| Number of successful attacks: | 3      |
+| Number of failed attacks:     | 0      |
+| Number of skipped attacks:    | 1      |
+| Original accuracy:            | 75.0%  |
+| Accuracy under attack:        | 0.0%   |
+| Attack success rate:          | 100.0% |
+| Average perturbed word %:     | 22.04% |
+| Average num. words per input: | 15.5   |
+| Avg num queries:              | 175.67 |
+-------------------------------+--------+
--- a/tests/test_command_line/test_attack.py
+++ b/tests/test_command_line/test_attack.py
@@ -135,6 +135,17 @@ attack_test_params = [
        ),
        "tests/sample_outputs/kuleshov_cnn_sst_2.txt",
    ),
+    #
+    # test: run_attack on LSTM MR using word embedding transformation and greedy search with Stanza part-of-speech tagger as a constraint
+    #
+    (
+        "run_attack_stanza_pos_tagger",
+        (
+            "textattack attack --model lstm-mr --num-examples 4 --search-method greedy --transformation word-swap-embedding "
+            "--constraints repeat stopword part-of-speech^tagger_type=\\'stanza\\' --shuffle=False"
+        ),
+        "tests/sample_outputs/run_attack_stanza_pos_tagger.txt",
+    ),
 ]


--- a/textattack/constraints/grammaticality/part_of_speech.py
+++ b/textattack/constraints/grammaticality/part_of_speech.py
@@ -3,6 +3,7 @@ from flair.data import Sentence
 from flair.models import SequenceTagger
 import lru
 import nltk
+import stanza

 import textattack
 from textattack.constraints import Constraint
@@ -46,10 +47,11 @@ class PartOfSpeech(Constraint):
    of `<https://arxiv.org/abs/1907.11932>`_ adapted from
    `<https://github.com/jind11/TextFooler>`_.

-    POS tagger from Flair `<https://github.com/flairNLP/flair>` also available
+    POS taggers from Flair `<https://github.com/flairNLP/flair>`_ and
+    Stanza `<https://github.com/stanfordnlp/stanza>`_ are also available.

    Args:
-        tagger_type (str): Name of the tagger to use (available choices: "nltk", "flair").
+        tagger_type (str): Name of the tagger to use (available choices: "nltk", "flair", "stanza").
        tagset (str): tagset to use for POS tagging
        allow_verb_noun_swap (bool): If `True`, allow verbs to be swapped with nouns and vice versa.
        compare_against_original (bool): If `True`, compare against the original text.
@@ -75,6 +77,11 @@ class PartOfSpeech(Constraint):
            else:
                self._flair_pos_tagger = SequenceTagger.load("pos-fast")

+        if tagger_type == "stanza":
+            self._stanza_pos_tagger = stanza.Pipeline(
+                lang="en", processors="tokenize, pos", tokenize_pretokenized=True
+            )
+
    def clear_cache(self):
        self._pos_tag_cache.clear()

@@ -101,6 +108,11 @@ class PartOfSpeech(Constraint):
                    context_key_sentence
                )

+            if self.tagger_type == "stanza":
+                word_list, pos_list = textattack.shared.utils.zip_stanza_result(
+                    self._stanza_pos_tagger(context_key), tagset=self.tagset
+                )
+
            self._pos_tag_cache[context_key] = (word_list, pos_list)

        # idx of `word` in `context_words`
--- a/textattack/shared/utils/install.py
+++ b/textattack/shared/utils/install.py
@@ -115,6 +115,10 @@ def _post_install():
    nltk.download("universal_tagset")
    nltk.download("wordnet")

+    import stanza
+
+    stanza.download("en")
+

 def set_cache_dir(cache_dir):
    """Sets all relevant cache directories to ``TA_CACHE_DIR``."""
--- a/textattack/shared/utils/strings.py
+++ b/textattack/shared/utils/strings.py
@@ -197,3 +197,26 @@ def zip_flair_result(pred, tag_type="pos-fast"):
            pos_list.append(token.get_tag("ner"))

    return word_list, pos_list
+
+
+def zip_stanza_result(pred, tagset="universal"):
+    """Takes a document from `stanza` and returns two lists, one of the
+    words across all of its sentences and the other of their corresponding
+    parts-of-speech."""
+    from stanza.models.common.doc import Document
+
+    if not isinstance(pred, Document):
+        raise TypeError("Result from Stanza POS tagger must be a `Document` object.")
+
+    word_list = []
+    pos_list = []
+
+    for sentence in pred.sentences:
+        for word in sentence.words:
+            word_list.append(word.text)
+            if tagset == "universal":
+                pos_list.append(word.upos)
+            else:
+                pos_list.append(word.xpos)
+
+    return word_list, pos_list