-
Notifications
You must be signed in to change notification settings - Fork 37
/
EnSentenceEvaluation.py
51 lines (45 loc) · 1.28 KB
/
EnSentenceEvaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#-*- encoding: utf-8 -*-
'''
@chenbjin 2015-04-26
Generate summaries for the DUC2002 single-document corpus.
'''
import os
import re
from EnExtractor import EnExtractor
import sys
# Python 2 only: `reload` is a builtin there. `sys` is reloaded first so that
# `sys.setdefaultencoding` can be called on the next line; that call switches
# the process-wide default string encoding to UTF-8.
# NOTE(review): neither name exists on Python 3 -- confirm the target
# interpreter before porting this module.
reload(sys)
sys.setdefaultencoding('utf-8')
def getdoclist(path):
    """Return the names of the entries directly under ``path``, sorted ascending."""
    entries = os.listdir(path)
    entries.sort()
    return entries
def getfilelist(path,doc):
    """Return the sorted entry names found in the ``doc`` directory under ``path``."""
    doc_dir = path + '/' + doc
    names = os.listdir(doc_dir)
    names.sort()
    return names
def getsentences(path,doc,filename):
    """Read one document file and return its usable lines.

    Args:
        path: Root directory of the corpus.
        doc: Name of the document directory under ``path``.
        filename: Name of the file inside that document directory.

    Returns:
        A list of the file's lines with surrounding whitespace stripped,
        keeping only those longer than one character.
    """
    # The context manager closes the handle even when reading fails; the
    # previous bare open() left that to the garbage collector.
    with open(path+'/'+doc+'/'+filename) as source:
        stripped = (line.strip() for line in source)
        # len > 1 removes blank lines and also one-character lines.
        # NOTE(review): presumably the latter are stray punctuation; the
        # threshold is kept exactly as it was.
        return [text for text in stripped if len(text) > 1]
def get_summary(sentences):
    """Summarize ``sentences`` with a freshly constructed ``EnExtractor``.

    Returns whatever ``EnExtractor.summary_train`` produces for the input.
    """
    return EnExtractor().summary_train(sentences)
def main(path):
    """Summarize every file of the corpus rooted at ``path``.

    Walks each document directory under ``path`` in sorted order, summarizes
    every file in it and writes the summary, one line per entry, to
    ``<abspath><doc>.<filename>``.

    Args:
        path: Root directory containing one sub-directory per document.
    """
    abspath = '/home/chenbjin/SearchJob/DUC2002_Summarization_Documents/wordnet.system.summary/'
    # getdoclist/getfilelist already return sorted lists, so no re-sorting.
    # print(...) with a single argument produces the same output under the
    # Python 2 print statement and the Python 3 print function.
    for doc in getdoclist(path):
        print(' '.join(['dealing with doc ', doc]))
        for filename in getfilelist(path, doc):
            print(' '.join(['------', filename]))
            sentences = getsentences(path, doc, filename)
            # Build the summary before touching the output file so that a
            # failure here does not leave a truncated, empty result behind.
            summary = get_summary(sentences)
            # `with` guarantees the output handle is closed even if a write
            # raises part-way through.
            with open(abspath+doc+'.'+filename,'w+') as out:
                out.writelines(line+'\n' for line in summary)
    print('-------done----------')
# Script entry point: summarize the DUC2002 test data when run directly.
if __name__ == '__main__':
    main('/home/chenbjin/SearchJob/DUC2002_Summarization_Documents/DUC2002_test_data')