Make rouge metrics available in sockeye.evaluate CLI (#471)
* Make rouge metrics available in sockeye.evaluate CLI

* Add chrf to optimizable metrics

* fix

* compute rouge in system tests

* fix
fhieber authored and David Vilar committed Jul 12, 2018
1 parent 7629fae commit 7b9fc92
Showing 8 changed files with 93 additions and 75 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -10,6 +10,11 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [1.18.35]
### Added
- ROUGE scores are now available in `sockeye-evaluate`.
- Enabled CHRF as an early-stopping metric.

## [1.18.34]
### Added
- Added support for `--beam-search-stop first` for decoding jobs with `--batch-size > 1`.
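To make the new functionality concrete, here is a minimal sketch of invoking the evaluate CLI in-process with one of the added ROUGE metrics, using the same sys.argv-patching trick as test/common.py below. The file names ref.txt and hyp.txt are placeholders, not part of this commit.

```python
import sys
from unittest.mock import patch

import sockeye.evaluate

# Placeholder file names; any tokenized reference/hypothesis files work.
argv = ["sockeye-evaluate",
        "--references", "ref.txt",
        "--hypotheses", "hyp.txt",
        "--metrics", "bleu", "chrf", "rouge1"]

with patch.object(sys, "argv", argv):
    sockeye.evaluate.main()
```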
2 changes: 1 addition & 1 deletion sockeye/__init__.py
@@ -11,4 +11,4 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '1.18.34'
__version__ = '1.18.35'
22 changes: 14 additions & 8 deletions sockeye/arguments.py
@@ -21,9 +21,9 @@
import yaml
from typing import Any, Callable, Dict, List, Tuple, Optional

from sockeye.lr_scheduler import LearningRateSchedulerFixedStep
from . import constants as C
from . import data_io
from .lr_scheduler import LearningRateSchedulerFixedStep


class ConfigArgumentParser(argparse.ArgumentParser):
@@ -32,7 +32,7 @@ class ConfigArgumentParser(argparse.ArgumentParser):
The option --config is added automatically and expects a YAML serialized
dictionary, similar to the return value of parse_args(). Command line
parameters have precendence over config file values. Usage should be
parameters have precedence over config file values. Usage should be
transparent, just substitute argparse.ArgumentParser with this class.
Extended from
@@ -1051,7 +1051,8 @@ def add_max_output_cli_args(params):
params.add_argument('--max-output-length',
type=int,
default=None,
help='Maximum number of words to generate during translation. If None, it will be computed automatically. Default: %(default)s.')
help='Maximum number of words to generate during translation. '
'If None, it will be computed automatically. Default: %(default)s.')


def add_inference_args(params):
@@ -1104,11 +1105,13 @@ def add_inference_args(params):
type=float,
default=0,
help='Pruning threshold for beam search. All hypotheses with scores not within '
'this amount of the best finished hypothesis are discarded (0 = off). Default: %(default)s.')
'this amount of the best finished hypothesis are discarded (0 = off). '
'Default: %(default)s.')
decode_params.add_argument('--beam-search-stop',
choices=[C.BEAM_SEARCH_STOP_ALL, C.BEAM_SEARCH_STOP_FIRST],
default=C.BEAM_SEARCH_STOP_ALL,
help='Stopping criteria. Quit when (all) hypotheses are finished or when a finished hypothesis is in (first) position. Default: %(default)s.')
help='Stopping criteria. Quit when (all) hypotheses are finished '
'or when a finished hypothesis is in (first) position. Default: %(default)s.')
decode_params.add_argument('--batch-size',
type=int_greater_or_equal(1),
default=1,
@@ -1160,7 +1163,8 @@ def add_inference_args(params):
decode_params.add_argument('--avoid-list',
type=str,
default=None,
help="Specify a file containing phrases (pre-processed, one per line) to block from the output. Default: %(default)s.")
help="Specify a file containing phrases (pre-processed, one per line) to block "
"from the output. Default: %(default)s.")
decode_params.add_argument('--strip-unknown-words',
action='store_true',
default=False,
@@ -1191,6 +1195,7 @@ def add_inference_args(params):
help='EXPERIMENTAL: may be changed or removed in future. Overrides training dtype of '
'encoders and decoders during inference. Default: %(default)s')


def add_evaluate_args(params):
eval_params = params.add_argument_group("Evaluate parameters")
eval_params.add_argument('--references', '-r',
@@ -1201,9 +1206,10 @@
type=file_or_stdin(),
default=[sys.stdin],
nargs='+',
help="File(s) with hypotheses. If none will read from stdin. Default: %(default)s.")
help="File(s) with hypotheses. If none will read from stdin. Default: stdin.")
eval_params.add_argument('--metrics',
nargs='+',
choices=C.EVALUATE_METRICS,
default=[C.BLEU, C.CHRF],
help='List of metrics to compute. Default: %(default)s.')
eval_params.add_argument('--sentence', '-s',
@@ -1212,7 +1218,7 @@
eval_params.add_argument('--offset',
type=float,
default=0.01,
help="Numerical value of the offset of zero n-gram counts. Default: %(default)s.")
help="Numerical value of the offset of zero n-gram counts for BLEU. Default: %(default)s.")
eval_params.add_argument('--not-strict', '-n',
action="store_true",
help="Do not fail if number of hypotheses does not match number of references. "
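The new `choices=C.EVALUATE_METRICS` makes argparse reject unknown metric names at parse time rather than failing later inside main(). Below is a standalone sketch of the same pattern; the parser and the literal metric strings are illustrative assumptions ("bleu", "chrf" and "rouge1" are confirmed by the test change further down, the rest are guesses at the constants' values).

```python
import argparse

# Assumed string values of the metric constants.
EVALUATE_METRICS = ["bleu", "chrf", "rouge1", "rouge2", "rougel"]

parser = argparse.ArgumentParser()
parser.add_argument("--metrics",
                    nargs="+",                 # accept one or more names
                    choices=EVALUATE_METRICS,  # reject anything else up front
                    default=["bleu", "chrf"])

args = parser.parse_args(["--metrics", "bleu", "rouge1"])
print(args.metrics)  # ['bleu', 'rouge1']

# parser.parse_args(["--metrics", "meteor"]) would exit with
# "invalid choice: 'meteor'".
```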
8 changes: 5 additions & 3 deletions sockeye/constants.py
@@ -348,9 +348,11 @@
ROUGE_L_VAL = ROUGEL + "-val"
AVG_TIME = "avg-sec-per-sent-val"
DECODING_TIME = "decode-walltime-val"
METRICS = [PERPLEXITY, ACCURACY, BLEU, ROUGE1]
METRIC_MAXIMIZE = {ACCURACY: True, BLEU: True, ROUGE1: True, PERPLEXITY: False}
METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf}
METRICS = [PERPLEXITY, ACCURACY, BLEU, CHRF, ROUGE1]
METRIC_MAXIMIZE = {ACCURACY: True, BLEU: True, CHRF: True, ROUGE1: True, PERPLEXITY: False}
METRIC_WORST = {ACCURACY: 0.0, BLEU: 0.0, CHRF: 0.0, ROUGE1: 0.0, PERPLEXITY: np.inf}
METRICS_REQUIRING_DECODER = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL]
EVALUATE_METRICS = [BLEU, CHRF, ROUGE1, ROUGE2, ROUGEL]

# loss
CROSS_ENTROPY = 'cross-entropy'
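For orientation: METRIC_MAXIMIZE records each metric's optimization direction and METRIC_WORST seeds the best-so-far value, so adding CHRF to both is what makes it usable for early stopping. A sketch of how such a comparison might look, with plain string keys standing in for the constants (illustrative, not the trainer's actual code):

```python
import numpy as np

METRIC_MAXIMIZE = {"bleu": True, "chrf": True, "rouge1": True, "perplexity": False}
METRIC_WORST = {"bleu": 0.0, "chrf": 0.0, "rouge1": 0.0, "perplexity": np.inf}

def improved(metric: str, new: float, best: float) -> bool:
    """True if `new` beats `best` in this metric's optimization direction."""
    return new > best if METRIC_MAXIMIZE[metric] else new < best

best = METRIC_WORST["chrf"]  # start from the worst possible value
for checkpoint_score in [0.31, 0.42, 0.40]:
    if improved("chrf", checkpoint_score, best):
        best = checkpoint_score

print(best)  # 0.42
```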
120 changes: 64 additions & 56 deletions sockeye/evaluate.py
@@ -12,21 +12,23 @@
# permissions and limitations under the License.

"""
Evaluation CLI. Prints corpus BLEU
Evaluation CLI.
"""
import argparse
import logging
import sys
import numpy as np
from typing import Iterable, Optional
from collections import defaultdict
from functools import partial
from typing import Callable, Iterable, Dict, List, Tuple, Optional

import numpy as np

from contrib import sacrebleu, rouge
from sockeye.log import setup_main_logger, log_sockeye_version
from . import arguments
from . import constants as C
from . import data_io
from . import utils
from .log import setup_main_logger, log_sockeye_version

logger = setup_main_logger(__name__, file_logging=False)

@@ -54,35 +56,39 @@ def raw_corpus_chrf(hypotheses: Iterable[str], references: Iterable[str]) -> flo
return sacrebleu.corpus_chrf(hypotheses, references, order=sacrebleu.CHRF_ORDER, beta=sacrebleu.CHRF_BETA,
remove_whitespace=True)


def raw_corpus_rouge1(hypotheses: Iterable[str], references: Iterable[str]) -> float:
"""
Simple wrapper around ROUGE-1 implementation.
"""
Simple wrapper around ROUGE-1 implementation.
:param hypotheses: Hypotheses stream.
:param references: Reference stream.
:return: ROUGE-1 score as float between 0 and 1.
"""
return rouge.rouge_1(hypotheses, references)

:param hypotheses: Hypotheses stream.
:param references: Reference stream.
:return: ROUGE-1 score as float between 0 and 1.
"""
return rouge.rouge_1(hypotheses, references)

def raw_corpus_rouge2(hypotheses: Iterable[str], references: Iterable[str]) -> float:
"""
Simple wrapper around ROUGE-2 implementation.
"""
Simple wrapper around ROUGE-2 implementation.
:param hypotheses: Hypotheses stream.
:param references: Reference stream.
:return: ROUGE-2 score as float between 0 and 1.
"""
return rouge.rouge_2(hypotheses, references)

:param hypotheses: Hypotheses stream.
:param references: Reference stream.
:return: ROUGE-2 score as float between 0 and 1.
"""
return rouge.rouge_2(hypotheses, references)

def raw_corpus_rougel(hypotheses: Iterable[str], references: Iterable[str]) -> float:
"""
Simple wrapper around ROUGE-1 implementation.
"""
Simple wrapper around ROUGE-1 implementation.
:param hypotheses: Hypotheses stream.
:param references: Reference stream.
:return: ROUGE-L score as float between 0 and 1.
"""
return rouge.rouge_l(hypotheses, references)

:param hypotheses: Hypotheses stream.
:param references: Reference stream.
:return: ROUGE-L score as float between 0 and 1.
"""
return rouge.rouge_l(hypotheses, references)
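These wrappers can also be called directly. A small sketch, assuming the contrib rouge module is importable from a Sockeye checkout and given toy tokenized inputs (the printed values depend on the contrib implementation):

```python
from contrib import rouge

hyps = ["the cat sat on the mat", "a quick brown fox"]
refs = ["the cat is on the mat", "the quick brown fox"]

# Each function consumes parallel streams of whitespace-tokenized strings
# and returns a corpus-level score between 0 and 1.
print(rouge.rouge_1(hyps, refs))  # unigram overlap
print(rouge.rouge_2(hyps, refs))  # bigram overlap
print(rouge.rouge_l(hyps, refs))  # longest common subsequence
```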

def main():
params = argparse.ArgumentParser(description='Evaluate translations by calculating metrics with '
@@ -103,55 +109,57 @@ def main():

references = [' '.join(e) for e in data_io.read_content(args.references)]
all_hypotheses = [[h.strip() for h in hypotheses] for hypotheses in args.hypotheses]
metrics = args.metrics
logger.info("%d hypotheses | %d references", len(all_hypotheses), len(references))

if not args.not_strict:
for hypotheses in all_hypotheses:
utils.check_condition(len(hypotheses) == len(references),
"Number of hypotheses (%d) and references (%d) does not match." % (len(hypotheses),
len(references)))
metric_info = []
for metric in metrics:
metric_info.append("%s\t(s_opt)" % metric)
logger.info("%d hypothesis set(s) | %d hypotheses | %d references",
len(all_hypotheses), len(all_hypotheses[0]), len(references))

metric_info = ["%s\t(s_opt)" % name for name in args.metrics]
logger.info("\t".join(metric_info))

metrics = [] # type: List[Tuple[str, Callable]]
for name in args.metrics:
if name == C.BLEU:
func = partial(raw_corpus_bleu, offset=args.offset)
elif name == C.CHRF:
func = raw_corpus_chrf
elif name == C.ROUGE1:
func = raw_corpus_rouge1
elif name == C.ROUGE2:
func = raw_corpus_rouge2
elif name == C.ROUGEL:
func = raw_corpus_rougel
else:
raise ValueError("Unknown metric %s." % name)
metrics.append((name, func))

if not args.sentence:
scores = defaultdict(list)
scores = defaultdict(list) # type: Dict[str, List[float]]
for hypotheses in all_hypotheses:
for metric in metrics:
if metric == C.BLEU:
score = raw_corpus_bleu(hypotheses, references, args.offset)
elif metric == C.CHRF:
score = raw_corpus_chrf(hypotheses, references)
else:
raise ValueError("Unknown metric %s." % metric)
scores[metric].append(score)
for name, metric in metrics:
scores[name].append(metric(hypotheses, references))
_print_mean_std_score(metrics, scores)
else:
for hypotheses in all_hypotheses:
for h, r in zip(hypotheses, references):
scores = defaultdict(list)
for metric in metrics:
if metric == C.BLEU:
score = raw_corpus_bleu([h], [r], args.offset)
elif metric == C.CHRF:
score = raw_corpus_chrf(h, r)
else:
raise ValueError("Unknown metric %s." % metric)
scores[metric].append(score)
scores = defaultdict(list) # type: Dict[str, List[float]]
for name, metric in metrics:
scores[name].append(metric([h], [r]))
_print_mean_std_score(metrics, scores)


def _print_mean_std_score(metrics, scores):
scores_mean_std = []
for metric in metrics:
if len(scores[metric]) > 1:
score_mean = np.asscalar(np.mean(scores[metric]))
score_std = np.asscalar(np.std(scores[metric], ddof=1))
def _print_mean_std_score(metrics: List[Tuple[str, Callable]], scores: Dict[str, List[float]]):
scores_mean_std = [] # type: List[str]
for name, _ in metrics:
if len(scores[name]) > 1:
score_mean = np.asscalar(np.mean(scores[name]))
score_std = np.asscalar(np.std(scores[name], ddof=1))
scores_mean_std.append("%.3f\t%.3f" % (score_mean, score_std))
else:
score = scores[metric][0]
score = scores[name][0]
scores_mean_std.append("%.3f\t(-)" % score)
print("\t".join(scores_mean_std))

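The main() rewrite above replaces the per-metric if/elif chains with a list of (name, function) pairs built once; functools.partial bakes the BLEU-specific offset in so every scorer shares the same (hypotheses, references) call signature. A condensed, self-contained sketch of that pattern, with dummy scorers standing in for raw_corpus_bleu and friends:

```python
from functools import partial
from typing import Callable, List, Tuple

# Dummy scorers standing in for raw_corpus_bleu / raw_corpus_chrf above.
def corpus_bleu(hyps, refs, offset=0.01):
    return 0.25  # placeholder score

def corpus_chrf(hyps, refs):
    return 0.40  # placeholder score

# Bake the BLEU-only `offset` argument in once, so every entry exposes
# the same (hypotheses, references) call signature.
metrics = [("bleu", partial(corpus_bleu, offset=0.01)),
           ("chrf", corpus_chrf)]  # type: List[Tuple[str, Callable]]

hyps, refs = ["a b c"], ["a b d"]
for name, scorer in metrics:
    print(name, scorer(hyps, refs))
```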
3 changes: 0 additions & 3 deletions sockeye/train.py
@@ -78,9 +78,6 @@ def check_arg_compatibility(args: argparse.Namespace):
:param args: Arguments as returned by argparse.
"""
check_condition(args.optimized_metric == C.BLEU or args.optimized_metric == C.ROUGE1 or args.optimized_metric in args.metrics,
"Must optimize either BLEU, ROUGE or one of tracked metrics (--metrics)")

if args.encoder == C.TRANSFORMER_TYPE:
check_condition(args.transformer_model_size[0] == args.num_embed[0],
"Source embedding size must match transformer model size: %s vs. %s"
4 changes: 2 additions & 2 deletions sockeye/training.py
@@ -905,8 +905,8 @@ def _check_args(self,

utils.check_condition(early_stopping_metric in C.METRICS,
"Unsupported early-stopping metric: %s" % early_stopping_metric)
if early_stopping_metric == C.BLEU:
utils.check_condition(cp_decoder is not None, "%s requires CheckpointDecoder" % C.BLEU)
if early_stopping_metric in C.METRICS_REQUIRING_DECODER:
utils.check_condition(cp_decoder is not None, "%s requires CheckpointDecoder" % early_stopping_metric)

def _save_params(self):
"""
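Generalizing the guard from BLEU alone to membership in C.METRICS_REQUIRING_DECODER means any early-stopping metric computed on decoded output now demands a checkpoint decoder. A self-contained sketch of the idea; check_condition here is a stand-in for sockeye.utils.check_condition, which raises on failure:

```python
METRICS_REQUIRING_DECODER = ["bleu", "chrf", "rouge1", "rouge2", "rougel"]

def check_condition(condition: bool, message: str) -> None:
    # Stand-in for sockeye.utils.check_condition.
    if not condition:
        raise ValueError(message)

def check_args(early_stopping_metric: str, cp_decoder=None) -> None:
    # Perplexity/accuracy come straight from the training graph; metrics
    # scored on decoded output need a decoder to produce hypotheses first.
    if early_stopping_metric in METRICS_REQUIRING_DECODER:
        check_condition(cp_decoder is not None,
                        "%s requires CheckpointDecoder" % early_stopping_metric)

check_args("perplexity")  # fine: no decoder needed
try:
    check_args("chrf")    # decode-based metric without a decoder
except ValueError as e:
    print(e)              # "chrf requires CheckpointDecoder"
```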
4 changes: 2 additions & 2 deletions test/common.py
@@ -459,11 +459,11 @@ def run_train_translate(train_params: str,
if restrict_lexicon:
bleu_restrict = raw_corpus_bleu(hypotheses=hypotheses, references=references, offset=0.01)

# Run BLEU cli
# Run evaluate cli
eval_params = "{} {} ".format(sockeye.evaluate.__file__,
_EVAL_PARAMS_COMMON.format(hypotheses=out_path,
references=test_target_path,
metrics="bleu chrf",
metrics="bleu chrf rouge1",
quiet=quiet_arg), )
with patch.object(sys, "argv", eval_params.split()):
sockeye.evaluate.main()
