This repository has been archived by the owner on Feb 1, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
bleu.py
84 lines (67 loc) · 2.38 KB
/
bleu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from __future__ import division
from xml.etree.ElementTree import ElementTree
from xml.etree.cElementTree import parse as xmlparse
def clean(word):
    """Return *word* without leading/trailing commas, periods and spaces.

    Characters inside the word are left untouched.
    """
    strippable = ",. "
    return word.strip(strippable)
def bleu(text, entailment):
    """Score how well the n-grams of *entailment* are covered by *text*.

    Both strings are lower-cased, split on whitespace, and every token is
    passed through ``clean``.  For each n-gram order N from 1 up to the
    number of hypothesis tokens, the fraction of hypothesis N-grams that
    also occur somewhere in the text is computed (repeated hypothesis
    n-grams are each counted; no clipping is applied).  The score is the
    mean of those fractions over all orders.

    Args:
        text: The text string; ``None`` is treated as an empty string.
        entailment: The hypothesis string; ``None`` is treated as empty.

    Returns:
        A float in the range [0, 1]; 0.0 when the hypothesis has no tokens.
    """
    def ngrams(tokens, n):
        """Yield every contiguous window of *n* tokens as a tuple."""
        for start in range(len(tokens) - n + 1):
            yield tuple(tokens[start:start + n])

    words = [clean(x) for x in (text or "").lower().split()]
    hwords = [clean(x) for x in (entailment or "").lower().split()]
    if not hwords:
        # No hypothesis n-grams exist, so there is nothing to average over;
        # return early instead of dividing by zero below.
        return 0.0

    total = 0.0
    for order in range(1, len(hwords) + 1):
        # A set gives constant-time membership tests for the text n-grams.
        text_ngrams = set(ngrams(words, order))
        hyp_ngrams = list(ngrams(hwords, order))
        # Count with a generator rather than len(filter(...)): filter()
        # returns an iterator without len() on Python 3.
        matched = sum(1 for gram in hyp_ngrams if gram in text_ngrams)
        # hyp_ngrams is never empty here because order <= len(hwords).
        total += matched / float(len(hyp_ngrams))
    return total / len(hwords)
def parse_xml(fileh):
    """Load an XML document and index its ``<pair>`` elements by id.

    Args:
        fileh: Source handed straight to ``ElementTree.parse``.

    Returns:
        A dict mapping each pair's ``id`` attribute to a tuple
        ``(attrib, t_text, h_text)``: the pair's attribute dict followed by
        the text of its ``<t>`` and ``<h>`` children.  Only ``<pair>``
        elements matched by ``findall('pair')`` on the tree are included.
    """
    document = ElementTree()
    document.parse(fileh)

    by_id = {}
    for pair_elem in document.findall('pair'):
        attributes = pair_elem.attrib
        text_part = pair_elem.find('t').text
        hypothesis_part = pair_elem.find('h').text
        by_id[attributes['id']] = (attributes, text_part, hypothesis_part)
    return by_id
class Pair(object):
    """A text/hypothesis pair built from a ``<pair>`` element.

    Attributes:
        id: The element's ``id`` attribute, stripped of whitespace.
        task: The element's ``task`` attribute, stripped of whitespace.
        tast: Misspelled legacy alias of ``task``, kept so existing callers
            that read the old name keep working.
        text: List of ``Sentence`` objects for ``text/sentence`` children.
        hypothesis: List of ``Sentence`` objects for
            ``hypothesis/sentence`` children.
        entailment: The raw ``entailment`` attribute.

    Raises:
        KeyError: If the ``id``, ``task`` or ``entailment`` attribute is
            missing from the element.
    """

    def __init__(self, etree):
        attrib = etree.attrib
        self.id = attrib['id'].strip()
        self.task = attrib['task'].strip()
        # The attribute was originally spelled "tast"; keep it as an alias.
        self.tast = self.task
        self.text = [Sentence(s) for s in etree.iterfind('text/sentence')]
        self.hypothesis = [
            Sentence(s) for s in etree.iterfind('hypothesis/sentence')
        ]
        self.entailment = attrib['entailment']
class Sentence(object):
    """A sentence: a serial number plus the list of its nodes.

    Attributes:
        serial: The element's ``serial`` attribute, stripped of whitespace.
        nodes: ``Node`` objects, one per ``node`` child, in document order.
    """

    def __init__(self, etree):
        raw_serial = etree.attrib['serial']
        self.serial = raw_serial.strip()
        self.nodes = []
        for node_elem in etree.iterfind('node'):
            self.nodes.append(Node(node_elem))
class Node(object):
    """A single ``<node>`` element of a parsed sentence.

    Every attribute is defined on every instance, so callers never need
    ``hasattr`` checks; a child element that is absent yields ``None``.

    Attributes:
        id: The raw ``id`` attribute of the element.
        isWord: False when ``id`` starts with ``'E'`` (artificial node),
            True otherwise.
        word: Stripped text of the ``<word>`` child, or None if absent.
        lemma: Stripped text of the ``<lemma>`` child, or None if absent.
        postag: Stripped text of the ``<pos-tag>`` child, or None if absent.
        relation: Stripped text of the ``<relation>`` child, or None if
            absent.
        relations: Legacy alias of ``relation`` (the stripped relation used
            to be stored only under this name).

    Raises:
        KeyError: If the element has no ``id`` attribute.
    """

    def __init__(self, etree):
        self.id = etree.attrib['id']
        # Ids beginning with 'E' mark artificial nodes rather than words.
        self.isWord = not self.id.startswith('E')
        self.word = self._child_text(etree, 'word')
        self.lemma = self._child_text(etree, 'lemma')
        self.postag = self._child_text(etree, 'pos-tag')
        self.relation = self._child_text(etree, 'relation')
        # Keep the old attribute name available for existing callers.
        self.relations = self.relation

    @staticmethod
    def _child_text(etree, tag):
        """Return the stripped text of child *tag*, or None when missing."""
        text = etree.findtext(tag)
        if text is None:
            return None
        return text.strip()
def traverse(tree, function, threshold):
    """Print a YES/NO decision for every pair in *tree*.

    A ``ranked: no`` header line is printed first, followed by one line per
    pair of the form ``<id> YES`` or ``<id> NO``.

    Args:
        tree: Mapping of pair id to a ``(attrib, text, hypothesis)`` tuple.
        function: Callable taking ``(text, hypothesis)`` and returning a
            score comparable with *threshold*.
        threshold: A pair is answered ``YES`` only when its score is
            strictly greater than this value.
    """
    # Single-argument, parenthesised print produces the same output whether
    # print is a statement (Python 2) or a function (Python 3).
    print("ranked: no")
    for pair_id, (_attrib, text, hypothesis) in tree.items():
        verdict = 'YES' if function(text, hypothesis) > threshold else 'NO'
        print("%s %s" % (pair_id, verdict))
if __name__ == '__main__':
    import sys
    # Two arguments are required: the XML file of pairs and the decision
    # threshold.  Fail with a usage message instead of a bare IndexError.
    if len(sys.argv) < 3:
        sys.exit("usage: %s <pairs.xml> <threshold>" % sys.argv[0])
    data = parse_xml(sys.argv[1])
    traverse(data, bleu, float(sys.argv[2]))