1
0
mirror of https://github.com/QData/TextAttack.git synced 2021-10-13 00:05:06 +03:00

update models readme; merge

This commit is contained in:
Jack Morris
2020-07-06 16:51:22 -04:00
parent 71f98a6b84
commit 0c91cd9512
5 changed files with 104 additions and 64 deletions

View File

@@ -140,10 +140,7 @@ HUGGINGFACE_DATASET_BY_MODEL = {
"textattack/roberta-base-CoLA",
("glue", "cola", "validation"),
),
"roberta-base-imdb": (
"textattack/roberta-base-imdb",
("imdb", None, "test"),
),
"roberta-base-imdb": ("textattack/roberta-base-imdb", ("imdb", None, "test"),),
"roberta-base-mr": (
"textattack/textattack/roberta-base-rotten-tomatoes",
("rotten_tomatoes", None, "test"),
@@ -156,10 +153,7 @@ HUGGINGFACE_DATASET_BY_MODEL = {
"textattack/roberta-base-QNLI",
("glue", "qnli", "validation"),
),
"roberta-base-rte": (
"textattack/roberta-base-RTE",
("glue", "rte", "validation")
),
"roberta-base-rte": ("textattack/roberta-base-RTE", ("glue", "rte", "validation")),
"roberta-base-sst2": (
"textattack/roberta-base-SST-2",
("glue", "sst2", "validation"),
@@ -183,26 +177,20 @@ HUGGINGFACE_DATASET_BY_MODEL = {
"textattack/albert-base-v2-CoLA",
("glue", "cola", "validation"),
),
"albert-base-v2-imdb": (
"textattack/albert-base-v2-imdb",
("imdb", None, "test"),
),
"albert-base-v2-imdb": ("textattack/albert-base-v2-imdb", ("imdb", None, "test"),),
"albert-base-v2-mr": (
"textattack/albert-base-v2-rotten-tomatoes",
("rotten_tomatoes", None, "test"),
),
"albert-base-v2-rte": (
"textattack/albert-base-v2-RTE",
("glue", "rte", "validation")
"textattack/albert-base-v2-RTE",
("glue", "rte", "validation"),
),
"albert-base-v2-qqp": (
"textattack/albert-base-v2-QQP",
("glue", "qqp", "validation"),
),
"albert-base-v2-snli": (
"textattack/albert-base-v2-snli",
("snli", None, "test"),
),
"albert-base-v2-snli": ("textattack/albert-base-v2-snli", ("snli", None, "test"),),
"albert-base-v2-sst2": (
"textattack/albert-base-v2-SST-2",
("glue", "sst2", "validation"),
@@ -239,8 +227,8 @@ HUGGINGFACE_DATASET_BY_MODEL = {
("glue", "mrpc", "validation"),
),
"xlnet-base-cased-rte": (
"textattack/xlnet-base-cased-RTE",
("glue", "rte", "validation")
"textattack/xlnet-base-cased-RTE",
("glue", "rte", "validation"),
),
"xlnet-base-cased-stsb": (
"textattack/xlnet-base-cased-STS-B",
@@ -261,38 +249,18 @@ TEXTATTACK_DATASET_BY_MODEL = {
#
# LSTMs
#
"lstm-sst2": ("models/classification/lstm/sst2",
("glue", "sst2", "validation")
),
"lstm-yelp": (
"models/classification/lstm/yelp",
("yelp_polarity", None, "test"),
),
"lstm-imdb": (
"models/classification/lstm/imdb",
("imdb", None, "test")
),
"lstm-mr": (
"models/classification/lstm/mr",
("rotten_tomatoes", None, "test"),
),
"lstm-sst2": ("models/classification/lstm/sst2", ("glue", "sst2", "validation")),
"lstm-yelp": ("models/classification/lstm/yelp", ("yelp_polarity", None, "test"),),
"lstm-imdb": ("models/classification/lstm/imdb", ("imdb", None, "test")),
"lstm-mr": ("models/classification/lstm/mr", ("rotten_tomatoes", None, "test"),),
#
# CNNs
#
"cnn-sst2": (
"models/classification/cnn/sst",
("glue", "sst2", "validation")
),
"cnn-imdb": (
"models/classification/cnn/imdb",
("imdb", None, "test")
),
"cnn-yelp": (
"models/classification/cnn/yelp",
("yelp_polarity", None, "test"),
),
"cnn-sst2": ("models/classification/cnn/sst", ("glue", "sst2", "validation")),
"cnn-imdb": ("models/classification/cnn/imdb", ("imdb", None, "test")),
"cnn-yelp": ("models/classification/cnn/yelp", ("yelp_polarity", None, "test"),),
"cnn-mr": (
"models/classification/cnn/rotten-tomatoes",
"models/classification/cnn/rotten-tomatoes",
("rotten_tomatoes", None, "test"),
),
#

View File

@@ -11,9 +11,11 @@ from textattack.commands.attack.attack_args_helpers import *
logger = textattack.shared.logger
def _cb(s):
return textattack.shared.utils.color_text(str(s), color="blue", method="ansi")
class EvalModelCommand(TextAttackCommand):
"""
The TextAttack model benchmarking module:
@@ -29,12 +31,12 @@ class EvalModelCommand(TextAttackCommand):
def test_model_on_dataset(self, args):
model = parse_model_from_args(args)
dataset = parse_dataset_from_args(args)
preds = []
ground_truth_outputs = []
i = 0
while i < min(args.num_examples, len(dataset)):
dataset_batch = dataset[i:min(args.num_examples, i+args.batch_size)]
dataset_batch = dataset[i : min(args.num_examples, i + args.batch_size)]
batch_inputs = []
for (text_input, ground_truth_output) in dataset_batch:
attacked_text = textattack.shared.AttackedText(text_input)
@@ -43,20 +45,20 @@ class EvalModelCommand(TextAttackCommand):
ground_truth_outputs.append(ground_truth_output)
preds.extend(self.get_preds(model, batch_inputs))
i += args.batch_size
preds = torch.stack(preds).squeeze().cpu()
ground_truth_outputs = torch.tensor(ground_truth_outputs).cpu()
logger.info(f'Got {len(preds)} predictions.')
logger.info(f"Got {len(preds)} predictions.")
if preds.ndim == 1:
# if preds is just a list of numbers, assume regression for now
# TODO integrate with `textattack.metrics` package
pearson_correlation, _ = scipy.stats.pearsonr(ground_truth_outputs, preds)
spearman_correlation, _ = scipy.stats.spearmanr(ground_truth_outputs, preds)
logger.info(f'Pearson correlation = {_cb(pearson_correlation)}')
logger.info(f'Spearman correlation = {_cb(spearman_correlation)}')
logger.info(f"Pearson correlation = {_cb(pearson_correlation)}")
logger.info(f"Spearman correlation = {_cb(spearman_correlation)}")
else:
guess_labels = preds.argmax(dim=1)
successes = (guess_labels == ground_truth_outputs).sum().item()

View File

@@ -71,6 +71,9 @@ class ListThingsCommand(TextAttackCommand):
"feature", help=f"the feature to list", choices=ListThingsCommand.things()
)
parser.add_argument(
"--plain", help="print output without color", default=False, action='store_true'
"--plain",
help="print output without color",
default=False,
action="store_true",
)
parser.set_defaults(func=ListThingsCommand())

View File

@@ -50,11 +50,11 @@ All evaluations shown are on the full validation or test set up to 1000 examples
- nlp dataset `rotten_tomatoes`, split `test`
- Successes: 768/1000
- Accuracy: 76.80%
- SST-2 (`lstm-sst2`)
- SST-2 (`cnn-sst2`)
- nlp dataset `glue`, subset `sst2`, split `validation`
- Successes: 721/872
- Accuracy: 82.68%
- Yelp Polarity (`lstm-yelp`)
- Yelp Polarity (`cnn-yelp`)
- nlp dataset `yelp_polarity`, split `test`
- Successes: 913/1000
- Accuracy: 91.30%
@@ -66,7 +66,7 @@ All evaluations shown are on the full validation or test set up to 1000 examples
<section>
- CoLA `albert-base-v2-cola`
- CoLA (`albert-base-v2-cola`)
- nlp dataset `glue`, subset `cola`, split `validation`
- Successes: 829/1000
- Accuracy: 82.90%
@@ -102,7 +102,7 @@ All evaluations shown are on the full validation or test set up to 1000 examples
- nlp dataset `glue`, subset `wnli`, split `validation`
- Successes: 42/71
- Accuracy: 59.15%
- Yelp Polarity (`lstm-yelp`)
- Yelp Polarity (`albert-base-v2-yelp`)
- nlp dataset `yelp_polarity`, split `test`
- Successes: 963/1000
- Accuracy: 96.30%
@@ -242,12 +242,77 @@ All evaluations shown are on the full validation or test set up to 1000 examples
<section>
- CoLA (`roberta-base-cola`)
- nlp dataset `glue`, subset `cola`, split `validation`
- Successes: 857/1000
- Accuracy: 85.70%
- IMDB (`roberta-base-imdb`)
- nlp dataset `imdb`, split `test`
- Successes: 941/1000
- Accuracy: 94.10%
- Movie Reviews [Rotten Tomatoes] (`roberta-base-mr`)
- nlp dataset `rotten_tomatoes`, split `test`
- Successes: 883/1000
- Accuracy: 88.30%
- MRPC (`roberta-base-mrpc`)
- nlp dataset `glue`, subset `mrpc`, split `validation`
- Successes: 371/408
- Accuracy: 91.18%
- QNLI (`roberta-base-qnli`)
- nlp dataset `glue`, subset `qnli`, split `validation`
- Successes: 917/1000
- Accuracy: 91.70%
- Recognizing Textual Entailment (`roberta-base-rte`)
- nlp dataset `glue`, subset `rte`, split `validation`
- Successes: 217/277
- Accuracy: 78.34%
- SST-2 (`roberta-base-sst2`)
- nlp dataset `glue`, subset `sst2`, split `validation`
- Successes: 820/872
- Accuracy: 94.04%
- STS-b (`roberta-base-stsb`)
- nlp dataset `glue`, subset `stsb`, split `validation`
- Pearson correlation: 0.906067852162708
- Spearman correlation: 0.9025045272903051
- WNLI (`roberta-base-wnli`)
- nlp dataset `glue`, subset `wnli`, split `validation`
- Successes: 40/71
- Accuracy: 56.34%
</section>
### `xlnet-base-cased`
<section>
- CoLA (`xlnet-base-cased-cola`)
- nlp dataset `glue`, subset `cola`, split `validation`
- Successes: 800/1000
- Accuracy: 80.00%
- IMDB (`xlnet-base-cased-imdb`)
- nlp dataset `imdb`, split `test`
- Successes: 957/1000
- Accuracy: 95.70%
- Movie Reviews [Rotten Tomatoes] (`xlnet-base-cased-mr`)
- nlp dataset `rotten_tomatoes`, split `test`
- Successes: 876/1000
- Accuracy: 87.60%
- MRPC (`xlnet-base-cased-mrpc`)
- nlp dataset `glue`, subset `mrpc`, split `validation`
- Successes: 363/408
- Accuracy: 88.97%
- Recognizing Textual Entailment (`xlnet-base-cased-rte`)
- nlp dataset `glue`, subset `rte`, split `validation`
- Successes: 196/277
- Accuracy: 70.76%
- STS-b (`xlnet-base-cased-stsb`)
- nlp dataset `glue`, subset `stsb`, split `validation`
- Pearson correlation: 0.883111673280641
- Spearman correlation: 0.8773439961182335
- WNLI (`xlnet-base-cased-wnli`)
- nlp dataset `glue`, subset `wnli`, split `validation`
- Successes: 41/71
- Accuracy: 57.75%
</section>

View File

@@ -11,7 +11,9 @@ def load_cached_state_dict(model_folder_path):
# Take the first model matching the pattern *model.bin.
model_path_list = glob.glob(os.path.join(model_folder_path, "*model.bin"))
if not model_path_list:
raise FileNotFoundError(f"model.bin not found in model folder {model_folder_path}.")
raise FileNotFoundError(
f"model.bin not found in model folder {model_folder_path}."
)
model_path = model_path_list[0]
state_dict = torch.load(model_path, map_location=utils.device)
return state_dict