@@ -82,12 +82,12 @@ def main() -> None:
                               help="Scoring references (.txt, .json 'refs') vs. hypotheses (.json 'hyp_*')")
     parser_score.add_argument('--temp', type=float, default=1.0,
                               help="softmax temperature")
-    parser_score.add_argument('--split-size', type=int, default=2000,
+    parser_score.add_argument('--split-size', type=int, default=500,
                               help="split size (per GPU)")
     parser_score.add_argument('--no-mask', action='store_true',
                               help="Instead of making masked copies, do not mask")
-    parser_score.add_argument('--no-eos', action='store_true',
-                              help="do not append '.' (this may break train-test parity)")
+    parser_score.add_argument('--eos', action='store_true',
+                              help="append '.' (this can help mitigate train-test disparity)")
     parser_score.add_argument('--detok', action='store_true',
                               help="perform Moses English detokenization on hypotheses before scoring")
 
@@ -114,8 +114,8 @@ def main() -> None:
                             help="split size (per GPU)")
     parser_bin.add_argument('--no-mask', action='store_true',
                             help="Instead of making masked copies, do not mask")
-    parser_bin.add_argument('--no-eos', action='store_true',
-                            help="do not append '.' (this breaks train-test parity)")
+    parser_bin.add_argument('--eos', action='store_true',
+                            help="append '.' (this can help mitigate train-test disparity)")
     capitalize_parser = parser_bin.add_mutually_exclusive_group(required=False)
     capitalize_parser.add_argument('--capitalize', dest='capitalize', action='store_true')
     capitalize_parser.add_argument('--no-capitalize', dest='capitalize', action='store_false')
@@ -160,8 +160,8 @@ def main() -> None:
                                  help="Number of initial layers to freeze")
 
     # TODO: deduplicate
-    parser_finetune.add_argument('--no-eos', action='store_true',
-                                 help="do not append '.' (this breaks train-test parity)")
+    parser_finetune.add_argument('--eos', action='store_true',
+                                 help="append '.' (this can help mitigate train-test disparity)")
     capitalize_parser = parser_finetune.add_mutually_exclusive_group(required=False)
     capitalize_parser.add_argument('--capitalize', dest='capitalize', action='store_true')
     capitalize_parser.add_argument('--no-capitalize', dest='capitalize', action='store_false')
@@ -191,13 +191,13 @@ def cmd_score(args: argparse.Namespace) -> None:
 
     # Set scorer
     if isinstance(model, nlp.model.BERTModel):
-        scorer = MLMScorer(model, vocab, tokenizer, eos=(not args.no_eos), wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
+        scorer = MLMScorer(model, vocab, tokenizer, eos=args.eos, wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
     elif isinstance(model, BERTRegression):
-        scorer = RegressionScorer(model, vocab, tokenizer, eos=(not args.no_eos), wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
+        scorer = RegressionScorer(model, vocab, tokenizer, eos=args.eos, wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
     else:
         assert not args.whole_word_mask
         assert not args.no_mask
-        scorer = LMScorer(model, vocab, tokenizer, eos=(not args.no_eos), capitalize=args.capitalize, ctxs=ctxs)
+        scorer = LMScorer(model, vocab, tokenizer, eos=args.eos, capitalize=args.capitalize, ctxs=ctxs)
 
     # What data do we use?
     if args.mode == 'hyp':
@@ -225,18 +225,21 @@ def cmd_score(args: argparse.Namespace) -> None:
     scored_corpus = ScoredCorpus.from_corpus_and_scores(corpus, scores)
 
     num_words_list, max_sent_len = corpus.get_num_words()
-    if not args.no_eos:
+    if args.eos:
         logging.warn("Adding EOSes '.' to (P)PPL computation")
         num_words_list = [x+1 for x in num_words_list]
     num_words_total = sum(num_words_list)
-    logging.warn("# words (no added markers): {}".format(num_words_total))
+    if args.eos:
+        logging.warn("# words (excluding EOS '.'): {}".format(num_words_total))
+    else:
+        logging.warn("# words: {}".format(num_words_total))
     logging.warn("longest sentence: {}".format(max_sent_len))
 
     num_toks_total = sum(true_tok_lens)
-    if not args.no_eos:
-        logging.warn("# toks (including EOS '.'): {}".format(num_toks_total))
+    if args.eos:
+        logging.warn("# tokens (including EOS '.'): {}".format(num_toks_total))
     else:
-        logging.warn("# toks: {}".format(num_toks_total))
+        logging.warn("# tokens: {}".format(num_toks_total))
 
 
     if not args.per_token:
@@ -285,13 +288,13 @@ def cmd_bin(args: argparse.Namespace) -> None:
     if isinstance(model, nlp.model.BERTModel):
         assert not args.whole_word_mask
         assert not args.no_mask
-        binner = MLMBinner(model, vocab, tokenizer, eos=(not args.no_eos), capitalize=args.capitalize, ctxs=ctxs)
+        binner = MLMBinner(model, vocab, tokenizer, eos=args.eos, capitalize=args.capitalize, ctxs=ctxs)
     elif isinstance(model, BERTRegression):
         raise ValueError("Not supported")
     else:
         assert not args.whole_word_mask
         assert not args.no_mask
-        binner = LMBinner(model, vocab, tokenizer, eos=(not args.no_eos), capitalize=args.capitalize, ctxs=ctxs)
+        binner = LMBinner(model, vocab, tokenizer, eos=args.eos, capitalize=args.capitalize, ctxs=ctxs)
 
     # What data do we use?
     if args.mode == 'hyp':
@@ -419,5 +422,5 @@ def cmd_finetune(args: argparse.Namespace) -> None:
 
     ### FINETUNING LOOP
 
-    tuner = RegressionFinetuner(model, vocab, tokenizer, eos=(not args.no_eos), wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
+    tuner = RegressionFinetuner(model, vocab, tokenizer, eos=args.eos, wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
     tuner.tune(scored_corpus, ratio=1, split_size=args.split_size, output_dir=Path(args.output_dir))
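# Not part of the diff above: a minimal, self-contained sketch of the flag
# semantics this change introduces. Before the change, EOS appending was on by
# default and disabled via --no-eos (eos=(not args.no_eos)); afterwards it is
# off by default and enabled via --eos (eos=args.eos). The parser description
# and prints below are illustrative only.
import argparse

def build_parser() -> argparse.ArgumentParser:
    # Mirrors the post-change flag: appending '.' as EOS is opt-in.
    parser = argparse.ArgumentParser(description="eos flag sketch")
    parser.add_argument('--eos', action='store_true',
                        help="append '.' (this can help mitigate train-test disparity)")
    return parser

if __name__ == '__main__':
    args = build_parser().parse_args([])           # no flags: eos defaults to False
    print(args.eos)                                # False
    args = build_parser().parse_args(['--eos'])    # explicit opt-in, as in the new code
    print(args.eos)                                # True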