add thought vectors

2021-10-13 00:05:06 +03:00 · 2020-05-06 11:16:17 -04:00
parent 3602625e0b
commit d647133230
5 changed files with 69 additions and 3 deletions
--- a/textattack/constraints/semantics/thought_vector.py
+++ b/textattack/constraints/semantics/thought_vector.py
@@ -0,0 +1,66 @@
+import functools
+import torch
+
+from textattack.shared import utils
+from textattack.constraints import Constraint
+from textattack.shared import WordEmbedding
+from textattack.shared import TokenizedText
+
+class ThoughtVector(Constraint):
+    """
+    A constraint on the distance between two sentences' thought vectors.
+    
+    Args:
+        word_embedding (str): The word embedding to use
+        min_cos_sim: the minimum cosine similarity between thought vectors
+        max_mse_dist: the maximum euclidean distance between thought vectors
+    """
+    def __init__(self, embedding_type='paragramcf', max_mse_dist, min_cos_sim):
+        self.word_embedding = WordEmbedding(embedding_type)
+        
+        if (max_mse_dist or min_cos_sim) is None:
+            raise ValueError('Must set max_mse_dist or min_cos_sim')
+        
+        self.max_mse_dist = max_mse_dist
+        self.min_cos_sim = min_cos_sim
+    
+    @functools.lru_cache(maxsize=2**10)
+    def _get_thought_vector(self, tokenized_text):
+        return torch.sum([self.word_embedding[word] for word in tokenized_text.words])
+    
+    def __call__(self, x, x_adv):
+        """ Returns true if (x, x_adv) are closer than `self.min_cos_sim`
+            and `self.max_mse_dist`. """
+        
+        if not isinstance(x, TokenizedText):
+            raise TypeError('x must be of type TokenizedText')
+        if not isinstance(x_adv, TokenizedText):
+            raise TypeError('x_adv must be of type TokenizedText')
+        
+        thought_vector_1 = self._get_thought_vector(x)
+        thought_vector_2 = self._get_thought_vector(x_adv)
+        
+        # Check cosine distance.
+        if self.min_cos_sim:
+            cos_sim = torch.nn.CosineSimilarity(dim=0)(thought_vector_1, thought_vector_2)
+            if cos_sim < self.min_cos_sim:
+                return False
+        # Check MSE distance.
+        if self.max_mse_dist:
+            mse_dist = torch.sum((e1 - e2) ** 2)
+            if mse_dist > self.max_mse_dist:
+                return False
+        return True
+        
+    def extra_repr_keys(self):
+        """Set the extra representation of the constraint using these keys.
+        
+        To print customized extra information, you should reimplement
+        this method in your own constraint. Both single-line and multi-line
+        strings are acceptable.
+        """ 
+        if self.min_cos_sim is None:
+            metric = 'max_mse_dist'
+        else:
+            metric = 'min_cos_sim'
+        return ['embedding_type', metric]
--- a/textattack/models/helpers/__init__.py
+++ b/textattack/models/helpers/__init__.py
@@ -1,5 +1,5 @@
 # Helper stuff, like embeddings.
-from . import helper_utils
+from . import utils
 from .glove_embedding_layer import GloveEmbeddingLayer

 # Helper modules.
--- a/textattack/models/helpers/lstm_for_classification.py
+++ b/textattack/models/helpers/lstm_for_classification.py
@@ -5,7 +5,7 @@ import torch.nn as nn
 from textattack.shared import utils

 from textattack.models.helpers import GloveEmbeddingLayer
-from textattack.models.helpers.helper_utils import load_cached_state_dict
+from textattack.models.helpers.utils import load_cached_state_dict

 class LSTMForClassification(nn.Module):
    """ A long short-term memory neural network for text classification. 
--- a/textattack/models/helpers/helper_utils.py
+++ b/textattack/models/helpers/helper_utils.py
--- a/textattack/models/helpers/word_cnn_for_classification.py
+++ b/textattack/models/helpers/word_cnn_for_classification.py
@@ -5,7 +5,7 @@ import torch.nn.functional as F

 from textattack.shared import utils
 from textattack.models.helpers import GloveEmbeddingLayer
-from textattack.models.helpers.helper_utils import load_cached_state_dict
+from textattack.models.helpers.utils import load_cached_state_dict

 class WordCNNForClassification(nn.Module):
    """ A convolutional neural network for text classification.