More evaluation methods including CIDEr, METEOR and ROUGE besides BLEU1-4 #62

Open · wants to merge 9 commits into master
34 changes: 29 additions & 5 deletions eval.py
@@ -7,6 +7,11 @@
from nltk.translate.bleu_score import corpus_bleu
import torch.nn.functional as F
from tqdm import tqdm
# score evaluation packages from the COCO caption evaluation API
from evalfunc.bleu.bleu import Bleu
from evalfunc.rouge.rouge import Rouge
from evalfunc.cider.cider import Cider
from evalfunc.meteor.meteor import Meteor

# Parameters
data_folder = '/media/ssd/caption data' # folder with data files saved by create_input_files.py
@@ -168,12 +173,31 @@ def evaluate(beam_size):

    assert len(references) == len(hypotheses)

    # Calculate BLEU-4 scores
    bleu4 = corpus_bleu(references, hypotheses)

    return bleu4
    # Calculate BLEU, CIDEr, METEOR and ROUGE-L scores
    scorers = [
        (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
        (Cider(), "CIDEr"),
        (Meteor(), "METEOR"),
        (Rouge(), "ROUGE_L")
    ]

    # Join token ids into whitespace-separated strings, the format the COCO scorers expect
    hypo = [[' '.join(str(x) for x in h)] for h in hypotheses]
    ref = [[' '.join(str(x) for x in r) for r in refs] for refs in references]

    score = []
    method = []
    for scorer, method_i in scorers:
        score_i, scores_i = scorer.compute_score(ref, hypo)
        # Bleu returns a list of four scores, while the other scorers return a single value
        if isinstance(score_i, list):
            score.extend(score_i)
            method.extend(method_i)
        else:
            score.append(score_i)
            method.append(method_i)
    score_dict = dict(zip(method, score))

    return score_dict


if __name__ == '__main__':
    beam_size = 1
    print("\nBLEU-4 score @ beam size of %d is %.4f." % (beam_size, evaluate(beam_size)))
    score_dict = evaluate(beam_size)
    for method, score in score_dict.items():
        print('%s: %.4f' % (method, score))
        print("\n%s score @ beam size of %d is %.4f." % (method, beam_size, score))
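The scorers consume whitespace-joined strings, so the word-index lists built earlier in eval.py are stringified before scoring. A minimal sketch of the shapes involved, using made-up toy indices rather than real model output:

# Hypothetical toy data, for illustration only
hypotheses = [[3, 7, 9]]               # one decoded caption per image (word indices)
references = [[[3, 7, 9], [3, 8, 9]]]  # one or more reference captions per image

hypo = [[' '.join(str(x) for x in h)] for h in hypotheses]                  # [['3 7 9']]
ref = [[' '.join(str(x) for x in r) for r in refs] for refs in references]  # [['3 7 9', '3 8 9']]

score, scores = Bleu(4).compute_score(ref, hypo)  # score: [Bleu_1, ..., Bleu_4]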
19 changes: 19 additions & 0 deletions evalfunc/bleu/LICENSE
@@ -0,0 +1,19 @@
Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
1 change: 1 addition & 0 deletions evalfunc/bleu/__init__.py
@@ -0,0 +1 @@
__author__ = 'tylin'
Binary file added evalfunc/bleu/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added evalfunc/bleu/__pycache__/bleu.cpython-36.pyc
Binary file not shown.
Binary file not shown.
44 changes: 44 additions & 0 deletions evalfunc/bleu/bleu.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python
#
# File Name : bleu.py
#
# Description : Wrapper for BLEU scorer.
#
# Creation Date : 06-01-2015
# Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
# Authors : Hao Fang <[email protected]> and Tsung-Yi Lin <[email protected]>

from .bleu_scorer import BleuScorer


class Bleu:
def __init__(self, n=4):
# by default, compute BLEU score up to 4-grams
self._n = n
self._hypo_for_image = {}
self.ref_for_image = {}

def compute_score(self, gts, res):

bleu_scorer = BleuScorer(n=self._n)
for i in range(len(res)):
hypo = res[i]
ref = gts[i]

# Sanity check.
assert(type(hypo) is list)
assert(len(hypo) == 1)
assert(type(ref) is list)
assert(len(ref) >= 1)

bleu_scorer += (hypo[0], ref)

#score, scores = bleu_scorer.compute_score(option='shortest')
score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
#score, scores = bleu_scorer.compute_score(option='average', verbose=1)

# return (bleu, bleu_info)
return score, scores

def method(self):
return "Bleu"
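As the sanity checks above imply, this wrapper expects exactly one hypothesis string per image and at least one reference string per image, and it indexes gts and res positionally, so the parallel lists built in eval.py work directly. A small usage sketch with invented sentences (not part of this PR):

from evalfunc.bleu.bleu import Bleu

res = [['a dog runs on the grass']]                        # invented example: one hypothesis per image
gts = [['a dog is running on grass', 'a brown dog runs']]  # invented example: references for that image
score, scores = Bleu(4).compute_score(gts, res)
# score  -> corpus-level [Bleu_1, Bleu_2, Bleu_3, Bleu_4]
# scores -> per-image BLEU-n score lists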
263 changes: 263 additions & 0 deletions evalfunc/bleu/bleu_scorer.py
@@ -0,0 +1,263 @@
#!/usr/bin/env python

# bleu_scorer.py
# David Chiang <[email protected]>

# Copyright (c) 2004-2006 University of Maryland. All rights
# reserved. Do not redistribute without permission from the
# author. Not for commercial use.

# Modified by:
# Hao Fang <[email protected]>
# Tsung-Yi Lin <[email protected]>

'''Provides:
cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test().
cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked().
'''

import copy
import sys, math, re
from collections import defaultdict

def precook(s, n=4, out=False):
"""Takes a string as input and returns an object that can be given to
either cook_refs or cook_test. This is optional: cook_refs and cook_test
can take string arguments as well."""
words = s.split()
counts = defaultdict(int)
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
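# Illustrative note (added): for example, precook("the cat sat", n=2) returns (3, counts)
# where counts holds every 1- and 2-gram of the sentence:
#   ('the',): 1, ('cat',): 1, ('sat',): 1, ('the', 'cat'): 1, ('cat', 'sat'): 1
# i.e. the sentence length plus the n-gram counts consumed by cook_refs/cook_test below.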

def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average"
'''Takes a list of reference sentences for a single segment
and returns an object that encapsulates everything that BLEU
needs to know about them.'''

reflen = []
maxcounts = {}
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
for (ngram,count) in counts.items():
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)

# Calculate effective reference sentence length.
if eff == "shortest":
reflen = min(reflen)
elif eff == "average":
reflen = float(sum(reflen))/len(reflen)

## lhuang: N.B.: leave reflen computation to the very end!!

## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design)

return (reflen, maxcounts)

def cook_test(test, cooked_refs, eff=None, n=4):
'''Takes a test sentence and returns an object that
encapsulates everything that BLEU needs to know about it.'''
(reflen, refmaxcounts) = cooked_refs
testlen, counts = precook(test, n, True)

result = {}

# Calculate effective reference sentence length.

if eff == "closest":
result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
else: ## i.e., "average" or "shortest" or None
result["reflen"] = reflen

result["testlen"] = testlen

result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]

result['correct'] = [0]*n
for (ngram, count) in counts.items():
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)

return result
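# Illustrative note (added): with reflen = [10, 14] and testlen = 12, the "closest" rule above
# evaluates min((2, 10), (2, 14))[1] = 10, so ties are broken in favour of the shorter reference.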

class BleuScorer(object):
"""Bleu scorer.
"""

__slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
# special_reflen is used in oracle (proportional effective ref len for a node).

def copy(self):
''' copy the refs.'''
new = BleuScorer(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
new._score = None
return new

def __init__(self, test=None, refs=None, n=4, special_reflen=None):
''' singular instance '''

self.n = n
self.crefs = []
self.ctest = []
self.cook_append(test, refs)
self.special_reflen = special_reflen

def cook_append(self, test, refs):
'''called by constructor and __iadd__ to avoid creating new instances.'''

if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
cooked_test = cook_test(test, self.crefs[-1])
self.ctest.append(cooked_test) ## N.B.: -1
else:
self.ctest.append(None) # lens of crefs and ctest have to match

self._score = None ## need to recompute

def ratio(self, option=None):
self.compute_score(option=option)
return self._ratio

def score_ratio(self, option=None):
'''return (bleu, len_ratio) pair'''
return (self.fscore(option=option), self.ratio(option=option))

def score_ratio_str(self, option=None):
return "%.4f (%.2f)" % self.score_ratio(option)

def reflen(self, option=None):
self.compute_score(option=option)
return self._reflen

def testlen(self, option=None):
self.compute_score(option=option)
return self._testlen

def retest(self, new_test):
if type(new_test) is str:
new_test = [new_test]
assert len(new_test) == len(self.crefs), new_test
self.ctest = []
for t, rs in zip(new_test, self.crefs):
self.ctest.append(cook_test(t, rs))
self._score = None

return self

def rescore(self, new_test):
''' replace test(s) with new test(s), and returns the new score.'''

return self.retest(new_test).compute_score()

def size(self):
assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
return len(self.crefs)

def __iadd__(self, other):
'''add an instance (e.g., from another sentence).'''

if type(other) is tuple:
## avoid creating new BleuScorer instances
self.cook_append(other[0], other[1])
else:
assert self.compatible(other), "incompatible BLEUs."
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
self._score = None ## need to recompute

return self

def compatible(self, other):
return isinstance(other, BleuScorer) and self.n == other.n

def single_reflen(self, option="average"):
return self._single_reflen(self.crefs[0][0], option)

def _single_reflen(self, reflens, option=None, testlen=None):

if option == "shortest":
reflen = min(reflens)
elif option == "average":
reflen = float(sum(reflens))/len(reflens)
elif option == "closest":
reflen = min((abs(l-testlen), l) for l in reflens)[1]
else:
assert False, "unsupported reflen option %s" % option

return reflen

def recompute_score(self, option=None, verbose=0):
self._score = None
return self.compute_score(option, verbose)

def compute_score(self, option=None, verbose=0):
n = self.n
small = 1e-9
tiny = 1e-15 ## so that if guess is 0 still return 0
bleu_list = [[] for _ in range(n)]

if self._score is not None:
return self._score

if option is None:
option = "average" if len(self.crefs) == 1 else "closest"

self._testlen = 0
self._reflen = 0
totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}

# for each sentence
for comps in self.ctest:
testlen = comps['testlen']
self._testlen += testlen

if self.special_reflen is None: ## need computation
reflen = self._single_reflen(comps['reflen'], option, testlen)
else:
reflen = self.special_reflen

self._reflen += reflen

for key in ['guess','correct']:
for k in range(n):
totalcomps[key][k] += comps[key][k]

# append per image bleu score
bleu = 1.
for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)

# if verbose > 1:
# print(comps, reflen)

totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen

bleus = []
bleu = 1.
for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
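# (added note) brevity penalty: since ratio = testlen / reflen, exp(1 - 1/ratio) equals
# exp(1 - reflen/testlen), applied only when the candidate is shorter than the effective reference length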
if ratio < 1:
for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)

# if verbose > 0:
# print(totalcomps)
# print("ratio:", ratio)

self._score = bleus
return self._score, bleu_list
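For completeness, the Bleu wrapper above drives this class by accumulating (hypothesis, references) pairs and then scoring once; a standalone sketch with invented sentences (not from this PR):

scorer = BleuScorer(n=4)
# invented sentences, for illustration only
scorer += ('a dog runs on the grass', ['a dog is running on grass', 'a brown dog runs'])
scorer += ('a cat sits on a mat', ['a cat is sitting on the mat'])
score, scores = scorer.compute_score(option='closest')
# score: corpus-level BLEU-1..4, scores: per-sentence BLEU-1..4 lists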
1 change: 1 addition & 0 deletions evalfunc/cider/__init__.py
@@ -0,0 +1 @@
__author__ = 'tylin'
Binary file added evalfunc/cider/__pycache__/__init__.cpython-36.pyc
Binary file not shown.
Binary file added evalfunc/cider/__pycache__/cider.cpython-36.pyc
Binary file not shown.
Binary file not shown.