Change CLI default for aux punctuation

Julian Salazar
2020-05-15 01:47:24 -07:00
parent 6150dcabf8
commit 0285308736
5 changed files with 25 additions and 22 deletions

View File

@@ -50,7 +50,6 @@ mlm score \
     --model bert-base-en-uncased \
     --max-utts 3 \
     --gpus 0 \
-    --split-size 100 \
     examples/asr-librispeech-espnet/data/dev-other.am.json \
     > examples/demo/dev-other-3.lm.json
 ```

View File

@@ -19,6 +19,7 @@ for set in dev-clean dev-other test-clean test-other ; do
         --model ${model} \
         --gpus 0,1,2,3 \
         --split-size 2000 \
+        --eos \
         data/${set}.am.json \
         > output/${model}/${set}.lm.json
 done
@@ -34,6 +35,7 @@ for set in dev-clean dev-other test-clean test-other ; do
         --gpus 0,1,2,3 \
         --split-size 2000 \
         --weights params/bert-base-en-uncased-380k.params \
+        --eos \
         data/${set}.am.json \
         > output/${model}-380k/${set}.lm.json
 done
@@ -107,6 +109,7 @@ mkdir -p output-distill/${model}/params-1e-5_8gpu_384/
 mlm finetune \
     --model ${model} \
     --gpus 0,1,2,3,4,5,6,7 \
+    --eos \
     --corpus-dir output-distill \
     --score-dir output-distill/${model} \
     --output-dir output-distill/${model}/params-1e-5_8gpu_384/ \
@@ -124,6 +127,7 @@ for set in dev-clean ; do
         --model ${model} \
         --gpus 0 \
         --weights output-distill/${model}/params-1e-5_8gpu_384/epoch-10.params \
+        --eos \
         --no-mask \
         --split-size 500 \
         data/${set}.am.json \

View File

@@ -26,7 +26,6 @@ for file in data/* ; do
         --model ${model} \
         --gpus 0 \
         --split-size 500 \
-        --no-eos \
         ${file} \
         > output/${model}/$(basename ${file} .txt).lm.json
 done
@@ -54,7 +53,6 @@ for suffix in good.txt bad.txt ; do
         --model ${model} \
         --gpus 0 \
         --split-size 1500 \
-        --no-eos \
         data-concat/${suffix} \
         > output/${model}.${suffix}
 done

View File

@@ -21,7 +21,6 @@ for set in dev ; do
         --mode hyp \
         --model ${model} \
         --gpus 0 \
-        --no-eos \
         --split-size 500 \
         ${PREFIX_DIR}/${set}.${src}.bpe.beam_trans.nobpe \
         > output/${model}/${set}-${pair}.lm.json

View File

@@ -82,12 +82,12 @@ def main() -> None:
help="Scoring references (.txt, .json 'refs') vs. hypotheses (.json 'hyp_*')")
parser_score.add_argument('--temp', type=float, default=1.0,
help="softmax temperature")
parser_score.add_argument('--split-size', type=int, default=2000,
parser_score.add_argument('--split-size', type=int, default=500,
help="split size (per GPU)")
parser_score.add_argument('--no-mask', action='store_true',
help="Instead of making masked copies, do not mask")
parser_score.add_argument('--no-eos', action='store_true',
help="do not append '.' (this may break train-test parity)")
parser_score.add_argument('--eos', action='store_true',
help="append '.' (this can help mitigate train-test disparity)")
parser_score.add_argument('--detok', action='store_true',
help="perform Moses English detokenization on hypotheses before scoring")
@@ -114,8 +114,8 @@ def main() -> None:
help="split size (per GPU)")
parser_bin.add_argument('--no-mask', action='store_true',
help="Instead of making masked copies, do not mask")
parser_bin.add_argument('--no-eos', action='store_true',
help="do not append '.' (this breaks train-test parity)")
parser_bin.add_argument('--eos', action='store_true',
help="append '.' (this can help mitigate train-test disparity)")
capitalize_parser = parser_bin.add_mutually_exclusive_group(required=False)
capitalize_parser.add_argument('--capitalize', dest='capitalize', action='store_true')
capitalize_parser.add_argument('--no-capitalize', dest='capitalize', action='store_false')
@@ -160,8 +160,8 @@ def main() -> None:
help="Number of initial layers to freeze")
# TODO: deduplicate
parser_finetune.add_argument('--no-eos', action='store_true',
help="do not append '.' (this breaks train-test parity)")
parser_finetune.add_argument('--eos', action='store_true',
help="append '.' (this can help mitigate train-test disparity)")
capitalize_parser = parser_finetune.add_mutually_exclusive_group(required=False)
capitalize_parser.add_argument('--capitalize', dest='capitalize', action='store_true')
capitalize_parser.add_argument('--no-capitalize', dest='capitalize', action='store_false')
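The three hunks above swap the opt-out `--no-eos` flag for an opt-in `--eos` flag on the `score`, `bin`, and `finetune` subparsers (and lower the scoring `--split-size` default from 2000 to 500). A minimal sketch of what the flag swap means for callers, using plain `argparse` outside the project's actual `main()`: omitting the flag used to mean "append '.'", and now means "do not append '.'".

```python
import argparse

# Old behavior: EOS punctuation is on unless the caller opts out.
old = argparse.ArgumentParser()
old.add_argument('--no-eos', action='store_true',
    help="do not append '.' (this breaks train-test parity)")
assert old.parse_args([]).no_eos is False      # eos = not no_eos = True by default

# New behavior: EOS punctuation is off unless the caller passes --eos.
new = argparse.ArgumentParser()
new.add_argument('--eos', action='store_true',
    help="append '.' (this can help mitigate train-test disparity)")
assert new.parse_args([]).eos is False         # eos = False by default
```

This is why the shell scripts earlier in the diff add an explicit `--eos \` where the old behavior is still wanted (the LibriSpeech recipes) and simply drop `--no-eos \` where it is not.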
@@ -191,13 +191,13 @@ def cmd_score(args: argparse.Namespace) -> None:
     # Set scorer
     if isinstance(model, nlp.model.BERTModel):
-        scorer = MLMScorer(model, vocab, tokenizer, eos=(not args.no_eos), wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
+        scorer = MLMScorer(model, vocab, tokenizer, eos=args.eos, wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
     elif isinstance(model, BERTRegression):
-        scorer = RegressionScorer(model, vocab, tokenizer, eos=(not args.no_eos), wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
+        scorer = RegressionScorer(model, vocab, tokenizer, eos=args.eos, wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
     else:
         assert not args.whole_word_mask
         assert not args.no_mask
-        scorer = LMScorer(model, vocab, tokenizer, eos=(not args.no_eos), capitalize=args.capitalize, ctxs=ctxs)
+        scorer = LMScorer(model, vocab, tokenizer, eos=args.eos, capitalize=args.capitalize, ctxs=ctxs)

     # What data do we use?
     if args.mode == 'hyp':
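The scorer constructors above now receive `eos=args.eos` directly instead of negating the old flag. The diff does not show what `eos` does inside `MLMScorer`/`LMScorer`; purely as a hypothetical illustration (the real preprocessing may differ), "appending '.'" amounts to something like:

```python
def add_pseudo_eos(sentence: str, eos: bool) -> str:
    """Hypothetical helper: optionally append a '.' pseudo-EOS before
    tokenization so scored text better matches training punctuation."""
    return sentence + ' .' if eos else sentence

# add_pseudo_eos("i like apples", eos=True)  -> "i like apples ."
# add_pseudo_eos("i like apples", eos=False) -> "i like apples"
```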
@@ -225,18 +225,21 @@ def cmd_score(args: argparse.Namespace) -> None:
     scored_corpus = ScoredCorpus.from_corpus_and_scores(corpus, scores)

     num_words_list, max_sent_len = corpus.get_num_words()
-    if not args.no_eos:
+    if args.eos:
         logging.warn("Adding EOSes '.' to (P)PPL computation")
         num_words_list = [x+1 for x in num_words_list]
     num_words_total = sum(num_words_list)
-    logging.warn("# words (no added markers): {}".format(num_words_total))
+    if args.eos:
+        logging.warn("# words (excluding EOS '.'): {}".format(num_words_total))
+    else:
+        logging.warn("# words: {}".format(num_words_total))
     logging.warn("longest sentence: {}".format(max_sent_len))

     num_toks_total = sum(true_tok_lens)
-    if not args.no_eos:
-        logging.warn("# toks (including EOS '.'): {}".format(num_toks_total))
+    if args.eos:
+        logging.warn("# tokens (including EOS '.'): {}".format(num_toks_total))
     else:
-        logging.warn("# toks: {}".format(num_toks_total))
+        logging.warn("# tokens: {}".format(num_toks_total))

     if not args.per_token:
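The hunk above adjusts the word-level denominator: with `--eos`, each utterance gains one '.' and `num_words_list` is incremented accordingly, while the token count is logged separately. A small worked sketch, assuming the usual pseudo-perplexity definition exp(-total log-probability / count) (the exact formula lives elsewhere in the codebase) and a made-up score:

```python
import math

def pppl(total_log_prob: float, count: int) -> float:
    # Assumed definition: exponentiated average negative log-probability.
    return math.exp(-total_log_prob / count)

num_words_list = [5, 7, 3]        # words per utterance, before any added markers
total_log_prob = -42.0            # illustrative value, not a real score

print(pppl(total_log_prob, sum(num_words_list)))   # denominator without --eos: 15
num_words_list = [x + 1 for x in num_words_list]   # one '.' per utterance, as in the hunk
print(pppl(total_log_prob, sum(num_words_list)))   # denominator with --eos: 18
```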
@@ -285,13 +288,13 @@ def cmd_bin(args: argparse.Namespace) -> None:
     if isinstance(model, nlp.model.BERTModel):
         assert not args.whole_word_mask
         assert not args.no_mask
-        binner = MLMBinner(model, vocab, tokenizer, eos=(not args.no_eos), capitalize=args.capitalize, ctxs=ctxs)
+        binner = MLMBinner(model, vocab, tokenizer, eos=args.eos, capitalize=args.capitalize, ctxs=ctxs)
     elif isinstance(model, BERTRegression):
         raise ValueError("Not supported")
     else:
         assert not args.whole_word_mask
         assert not args.no_mask
-        binner = LMBinner(model, vocab, tokenizer, eos=(not args.no_eos), capitalize=args.capitalize, ctxs=ctxs)
+        binner = LMBinner(model, vocab, tokenizer, eos=args.eos, capitalize=args.capitalize, ctxs=ctxs)

     # What data do we use?
     if args.mode == 'hyp':
@@ -419,5 +422,5 @@ def cmd_finetune(args: argparse.Namespace) -> None:
     ### FINETUNING LOOP
-    tuner = RegressionFinetuner(model, vocab, tokenizer, eos=(not args.no_eos), wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
+    tuner = RegressionFinetuner(model, vocab, tokenizer, eos=args.eos, wwm=args.whole_word_mask, capitalize=args.capitalize, ctxs=ctxs)
     tuner.tune(scored_corpus, ratio=1, split_size=args.split_size, output_dir=Path(args.output_dir))