-
Notifications
You must be signed in to change notification settings - Fork 5
/
conllu-spaceafter.py
49 lines (45 loc) · 1.46 KB
/
conllu-spaceafter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# -*- coding: UTF-8 -*-
# Author: Aibek Makazhanov <[email protected]>
import sys
import os
import codecs
import re
# Matches a plain word ID ("7") or a multiword-token range ("7-8").  Anything
# else (for example an empty-node ID such as "7.1") is never aligned to text.
_TOKEN_ID_RE = re.compile(r'([0-9]+)(?:-([0-9]+))?$')
_SPACE_RE = re.compile(r'\s')
_NON_SPACE_RE = re.compile(r'\S')
_TEXT_MARKER = '# text = '
_NO_SPACE = 'SpaceAfter=No'


def _with_no_space(misc):
    """Return the MISC value *misc* with ``SpaceAfter=No`` present exactly once."""
    if misc in ('', '_'):
        return _NO_SPACE
    if _NO_SPACE in misc.split('|'):
        # Already annotated: keep the column unchanged so reruns are idempotent.
        return misc
    return misc + '|' + _NO_SPACE


def _annotate(lines):
    """Yield CoNLL-U lines with ``SpaceAfter=No`` added where the text has no gap.

    Each 10-column token line is located in the sentence's ``# text = `` value,
    scanning left to right.  When the character right after the token is not
    whitespace (and the token is not the last thing in the text), the MISC
    column receives ``SpaceAfter=No``.  Every other line is passed through.

    Args:
        lines: iterable of text lines; surrounding whitespace is stripped.

    Yields:
        One output line per input line, without a trailing newline.
    """
    text = None        # current sentence text, None until "# text = " is seen
    offset = 0         # position in `text` just past the last aligned token
    mwt = range(0)     # word IDs covered by the current multiword token
    for raw in lines:
        line = raw.strip()
        if not line:
            # A blank line ends the sentence: forget its text and alignment.
            text, offset, mwt = None, 0, range(0)
        elif line.startswith(_TEXT_MARKER):
            text, offset, mwt = line[len(_TEXT_MARKER):], 0, range(0)

        fields = line.split('\t')
        if len(fields) != 10 or text is None:
            # Comment, blank line, malformed row, or a sentence without text.
            yield line
            continue

        id_match = _TOKEN_ID_RE.match(fields[0])
        if id_match is None:
            # Empty nodes and unparseable IDs have nothing to align.
            yield line
            continue
        first, last = id_match.groups()
        if last is not None:
            # The range line carries the surface form; the words it covers
            # do not appear in the text themselves.
            mwt = range(int(first), int(last) + 1)
        elif int(first) in mwt:
            yield line
            continue

        form = fields[1]
        pos = text.find(form, offset)
        # Accept the match only when nothing but whitespace was skipped.
        if pos >= 0 and not _NON_SPACE_RE.search(text, offset, pos):
            offset = pos + len(form)
            if offset < len(text) and not _SPACE_RE.match(text, offset):
                fields[9] = _with_no_space(fields[9])
        yield '\t'.join(fields)


def main(path=None, out=None):
    """Add ``SpaceAfter=No`` annotations to a CoNLL-U file and print the result.

    Args:
        path: CoNLL-U file to read as UTF-8; defaults to ``sys.argv[1]``.
        out: binary stream that receives UTF-8 encoded output; defaults to
            the byte layer of ``sys.stdout``.
    """
    import io  # local import: the file's import block does not provide it

    if path is None:
        if len(sys.argv) < 2:
            sys.exit('usage: %s FILE.conllu' % sys.argv[0])
        path = sys.argv[1]
    if out is None:
        # Writing bytes keeps the output UTF-8 regardless of the locale; on
        # Python 3 the byte layer is sys.stdout.buffer.
        out = getattr(sys.stdout, 'buffer', sys.stdout)
    with io.open(path, 'r', encoding='utf-8') as src:
        for line in _annotate(src):
            out.write((line + '\n').encode('utf-8'))
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()