From 672729747432810f9bcb37149104124dd3cc4165 Mon Sep 17 00:00:00 2001
From: Julian Salazar
Date: Sun, 11 Oct 2020 01:11:53 -0700
Subject: [PATCH] Update tests, add BLiMP results for DistilBERT/ALBERT (#2).

---
 examples/lingacc-blimp/README.md |  2 ++
 tests/test_models.py             | 35 +++++++++++++++++++++-------
 tests/test_scorers.py            | 39 ++++++++++++++++++++++++++++----
 3 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/examples/lingacc-blimp/README.md b/examples/lingacc-blimp/README.md
index 828eec6..c0d5e6f 100644
--- a/examples/lingacc-blimp/README.md
+++ b/examples/lingacc-blimp/README.md
@@ -41,6 +41,8 @@ echo "RoBERTa (large)"
 ```
 
 These give 82.6% and 86.5%, respectively. Human performance is 88.6%.
+See [the paper](https://www.aclweb.org/anthology/2020.acl-main.240/) for complete results. After the paper, we found `distilbert-base-cased` gives 78.3% and `albert-xxlarge-v2` gives 84.4%; details in [Issue #2](https://github.com/awslabs/mlm-scoring/issues/2).
+
 ## Pseudo-perplexities
 
 This gives token-level PPPLs of 59.2 on the acceptable sentences and 111.2 on the unacceptable ones:
diff --git a/tests/test_models.py b/tests/test_models.py
index 8686b7d..ee5463b 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -1,28 +1,47 @@
 import pytest
 
-import mxnet as mx
+# MXNet-based
 import gluonnlp as nlp
+import mxnet as mx
+# PyTorch-based
+import torch
+import transformers
 
 from mlm.models import get_pretrained
+from mlm.models.bert import BertForMaskedLMOptimized
 
 
 def test_get_pretrained():
 
-    # bert-base-en-uncased
+    # MXNet: bert-base-en-uncased
 
     model, vocab, tokenizer = get_pretrained([mx.cpu()], 'bert-base-en-uncased')
     # Check the model
     assert isinstance(model, nlp.model.BERTModel)
     assert len(model.encoder.transformer_cells) == 12
-    assert pytest.approx(model.word_embed[0].params['bertmodel0_word_embed_embedding0_weight']._data[0][0,0].asscalar()) == -0.0424806065
-    # Check the vocab
     unk_idx = vocab.token_to_idx[vocab.unknown_token]
+    assert pytest.approx(model.word_embed[0].params['bertmodel0_word_embed_embedding0_weight']._data[0][unk_idx,0].asscalar()) == -0.0424806065
+    # Check the vocab
     assert vocab.token_to_idx['test'] != unk_idx
     assert vocab.token_to_idx['Test'] == unk_idx
     # Check the tokenizer
     assert tuple(tokenizer("The man jumped up, put his basket on Philammon's head")) == ('the', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on', 'phil', '##am', '##mon', "'", 's', 'head')
 
-    # bert-base-en-uncased-owt
+    # PyTorch: bert-base-en-uncased
+
+    model, _, tokenizer = get_pretrained([mx.cpu()], 'bert-base-uncased')
+    # Check the model
+    assert isinstance(model, BertForMaskedLMOptimized)
+    assert len(model.bert.encoder.layer) == 12
+    unk_idx = tokenizer.unk_token_id
+    assert pytest.approx(model.bert.embeddings.word_embeddings.parameters().__next__()[unk_idx,0].detach().numpy().item()) == -0.0424806065
+    # Check the vocab
+    assert tokenizer.convert_tokens_to_ids('test') != unk_idx
+    assert tokenizer.convert_tokens_to_ids('Test') == unk_idx
+    # Check the tokenizer
+    assert tuple(tokenizer.tokenize("The man jumped up, put his basket on Philammon's head")) == ('the', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on', 'phil', '##am', '##mon', "'", 's', 'head')
+
+    # MXNet: bert-base-en-uncased-owt
 
     model, vocab_new, tokenizer = get_pretrained([mx.cpu()], 'bert-base-en-uncased-owt')
     # Check the model
@@ -32,7 +51,7 @@ def test_get_pretrained():
     # Check the tokenizer
     assert tuple(tokenizer("The man jumped up, put his basket on Philammon's head")) == ('the', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on', 'phil', '##am', '##mon', "'", 's', 'head')
 
-    # bert-large-en-cased
+    # MXNet: bert-large-en-cased
 
     model, vocab, tokenizer = get_pretrained([mx.cpu()], 'bert-large-en-cased')
     # Check the model
@@ -47,7 +66,7 @@ def test_get_pretrained():
     # Check the tokenizer
     assert tuple(tokenizer("The man jumped up, put his basket on Philammon's head")) == ('The', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on', 'Phil', '##am', '##mon', "'", 's', 'head')
 
-    # bert-base-multi-cased
+    # MXNet: bert-base-multi-cased
 
     model, vocab, tokenizer = get_pretrained([mx.cpu()], 'bert-base-multi-cased')
     # Check the model
@@ -59,4 +78,4 @@ def test_get_pretrained():
     assert vocab.token_to_idx['Test'] != unk_idx
     assert vocab.token_to_idx['これは'] != unk_idx
     # Check the tokenizer
-    assert tuple(tokenizer("これは Test ですよ。")) == ('これは', 'Test', 'で', '##す', '##よ', '。')
+    assert tuple(tokenizer("これはTestですよ。")) == ('これは', '##T', '##est', '##で', '##す', '##よ', '。')
diff --git a/tests/test_scorers.py b/tests/test_scorers.py
index c79e21f..5644f23 100644
--- a/tests/test_scorers.py
+++ b/tests/test_scorers.py
@@ -5,11 +5,12 @@ from mxnet.gluon.data import Dataset
 
 from mlm.loaders import Corpus
 from mlm.models import get_pretrained
-from mlm.scorers import MLMScorer, MLMScorerPT
+from mlm.scorers import LMScorer, MLMScorer, MLMScorerPT
 
 
+# The ASR case, where we append . as an EOS
 
-def _get_scorer_and_corpus():
+def _get_scorer_and_corpus_eos():
     ctxs = [mx.cpu()]
     model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-uncased')
     scorer_mx = MLMScorer(model, vocab, tokenizer, ctxs, eos=True, wwm=False)
@@ -20,18 +21,46 @@
 
 
 def test_mlmscorer_corpus_to_dataset():
-    scorer_mx, scorer_pt, corpus = _get_scorer_and_corpus()
+    scorer_mx, scorer_pt, corpus = _get_scorer_and_corpus_eos()
     dataset = scorer_mx.corpus_to_dataset(corpus)
     assert isinstance(dataset, Dataset)
     # Our three tokens, plus the EOS
     assert len(dataset) == 4
 
 
-def test_mlmscorer_score():
-    scorer_mx, scorer_pt, corpus = _get_scorer_and_corpus()
+def test_mlmscorer_score_eos():
+    scorer_mx, scorer_pt, corpus = _get_scorer_and_corpus_eos()
     scores, _ = scorer_mx.score(corpus)
     assert len(scores) == 1
     assert pytest.approx(scores[0], abs=0.0001) == -13.3065947
     scores, _ = scorer_pt.score(corpus)
     assert len(scores) == 1
     assert pytest.approx(scores[0], abs=0.0001) == -13.3065947
+
+
+# The general case
+
+def test_mlmscorer_score_sentences():
+
+    TEST_CASES = (
+        # README examples
+        ('bert-base-en-cased', MLMScorer, (None, -6.126666069030762, -5.50140380859375, -0.7823182344436646, None)),
+        ('bert-base-cased', MLMScorerPT, (None, -6.126738548278809, -5.501765727996826, -0.782496988773346, None)),
+        ('gpt2-117m-en-cased', LMScorer, (-8.293947219848633, -6.387561798095703, -1.3138668537139893)),
+        # etc.
+        ('albert-base-v2', MLMScorerPT, (None, -16.480087280273438, -12.897505760192871, -4.277405738830566, None)),
+        ('distilbert-base-cased', MLMScorerPT, (None, -5.1874895095825195, -6.390861511230469, -3.8225560188293457, None)),
+    )
+
+    for name, scorer_cls, expected_scores in TEST_CASES:
+        model, vocab, tokenizer = get_pretrained([mx.cpu()], name)
+        scorer = scorer_cls(model, vocab, tokenizer, [mx.cpu()])
+        scores = scorer.score_sentences(["Hello world!"], per_token=True)[0]
+        expected_total = 0
+        for score, expected_score in zip(scores, expected_scores):
+            if score is None and expected_score is None:
+                continue
+            assert pytest.approx(score, abs=0.0001) == expected_score
+            expected_total += expected_score
+        score_total = scorer.score_sentences(["Hello world!"], per_token=False)[0]
+        assert pytest.approx(score_total, abs=0.0001) == expected_total
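For reference, below is a minimal usage sketch of the PyTorch-backed scoring path that the new `test_mlmscorer_score_sentences` exercises. It uses only calls that appear in the patch (`get_pretrained`, `MLMScorerPT`, `score_sentences`); the model name and sentence are illustrative, and the exact score values depend on the pinned `transformers` weights.

```python
import mxnet as mx

from mlm.models import get_pretrained
from mlm.scorers import MLMScorerPT

# Hugging Face checkpoint names such as 'distilbert-base-cased' load the
# PyTorch-backed model and are scored with MLMScorerPT (see TEST_CASES).
ctxs = [mx.cpu()]
model, vocab, tokenizer = get_pretrained(ctxs, 'distilbert-base-cased')
scorer = MLMScorerPT(model, vocab, tokenizer, ctxs)

# Per-token pseudo-log-likelihoods; None entries are unscored positions,
# matching the expected tuples asserted in the test above.
print(scorer.score_sentences(["Hello world!"], per_token=True))

# Sentence-level totals (the per_token=False case asserted in the test).
print(scorer.score_sentences(["Hello world!"], per_token=False))
```

The MXNet-backed names (e.g. `bert-base-en-cased`) follow the same pattern with `MLMScorer`, and `gpt2-117m-en-cased` with `LMScorer`, as in the TEST_CASES table.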