-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
54 lines (44 loc) · 2.03 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from nltk.tokenize import TweetTokenizer
from nltk.translate import bleu_score
import string
"""
We give a sentence and it tokenizes it.
When we set preserve_case to False it will downcase the string.
It is True by default.
"""
def tokenize(s):
return TweetTokenizer(preserve_case=False).tokenize(s)
"""
Input : candidate sequence should be provided as a list of tokens and
reference sequences in the form of list of references where each
reference is list of tokens again.
Output : sentence_bleu calculates the score.
BLEU score will vary from 0 to 1.
SmoothingFunction is used when sentences are smaller.
Smoothing method 2: Adds 1 to both numerator and denominator from Chin-Yew Lin and Franz Josef Och (2004)
Automatic evaluation of machine translation quality using longest common subsequence
and skip-bigram statistics. In ACL04(http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf).
"""
def calc_bleu_many(cand_sequence,ref_sequences):
return bleu_score.sentence_bleu(ref_sequences,cand_sequence,
smoothing_function=bleu_score.SmoothingFunction().method1,
weights=(0.5,0.5))
"""
When we have just one generated sequence and one reference sequence.
We calculate bleu score with the other function we created here.
Input : candidate sequence should be provided as a list of tokens and
reference sequences in the form of list of tokens.
Output : sentence_bleu calculates the score.
"""
def calc_bleu(cand_sequence,ref_sequence):
return calc_bleu_many(cand_sequence,[ref_sequence])
"""
Input : List of Tokens.
Output : Corresponding String.
"""
def untokenize(tokens):
return "".join([" " + i for i in tokens]).strip()
#print(untokenize(["Hey",",","How","are","you","?"]))
#print(calc_bleu(["I","am","feeling","good"],["I","am","not","feeling","well"]))
#print(calc_bleu_many(["Hey",",","How","are","you","?"],[["You","are","well","?"],["How","you","doing","?"]]))
#print(tokenize("How are you?"))