-
Notifications
You must be signed in to change notification settings - Fork 0
/
mayal_with_nulls.py
116 lines (95 loc) · 4.75 KB
/
mayal_with_nulls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import nltk
import re
from nltk.corpus import PlaintextCorpusReader
import pandas as pd
import dataframe_image as dfi
# Directory holding the corpus text files. MayalProcessor.preprocess_work() builds
# file paths by plain string concatenation (root + work + ".txt"), so this value
# must end with a path separator. Note the Windows-style backslashes.
root = ".\\corpora\\"
# NLTK plaintext reader over `root`; the ".*" pattern selects every file id.
# NOTE(review): `files` is only referenced by the commented-out line below in the
# visible code — confirm whether anything else still needs it.
files = PlaintextCorpusReader(root, ".*")
# Example of pulling one work's raw text through the reader (kept for reference):
#raw = files.raw("பதிற்றுப்பத்து.txt")
# Characters that are replaced by a single space during preprocessing:
#   ' ] - : [ , ! . = * ) ;
# Raw string so the regex escapes are not parsed as (invalid) Python string escapes.
punct = re.compile(r"[\'\]\-\:\[\,!\.\=\*\);]")
# Characters that are deleted outright during preprocessing: digits and "(".
dropper = re.compile(r"[\d\(]")
# Tamil sign virama (U+0BCD); appended to a consonant letter it suppresses the
# inherent vowel, so `consonant + pulli + consonant` spells a consonant cluster.
pulli = '\u0BCD'
# Bare consonant letters, in the order used for table rows and columns.
con = ['க', 'ங', 'ச', 'ஞ', 'ட', 'ண', 'ற', 'ன', 'த', 'ந', 'ப', 'ம', 'ய', 'வ', 'ர', 'ல', 'ள', 'ழ']
# The same consonants with the pulli attached (same order as `con`).
cons = ['க்', 'ங்', 'ச்' , 'ஞ்', 'ட்', 'ண்', 'ற்', 'ன்', 'த்', 'ந்', 'ப்', 'ம்', 'ய்', 'வ்', 'ர்', 'ல்', 'ள்', 'ழ்']
# Tamil consonant -> romanised label; the empty string maps to '∅', which is used
# as the "no consonant" marker at word boundaries.
iso = {'': '∅', 'க' : 'k', 'ங': 'ṅ', 'ச': 'c', 'ஞ': 'ñ', 'ட': 'ṭ', 'ண': 'ṇ', 'ற': 'ṟ', 'ன': 'ṉ', 'த': 't', 'ந': 'n', 'ப': 'p', 'ம': 'm', 'ய': 'y', 'வ': 'v', 'ர': 'r', 'ல': 'l', 'ள': 'ḷ', 'ழ': 'ḻ'}
# Romanised labels in table order ('∅' first, then one per entry of `con`).
iso_cons = ['∅', 'k', 'ṅ', 'c' , 'ñ', 'ṭ', 'ṇ', 'ṟ', 'ṉ', 't', 'n', 'p', 'm', 'y', 'v', 'r', 'l', 'ḷ', 'ḻ']
class MayalProcessor:
    '''
    Build consonant-transition tables for a corpus work and export them as images.

    For each work, process() counts (first consonant, second consonant) pairs,
    then renders three PNG tables: raw counts, row-normalised likelihoods and
    column-normalised likelihoods. Rows are labelled by `self.nilai` (the first
    member of each pair) and columns by `self.varu` (the second member); both are
    set by process().
    '''

    # Cell styling shared by every exported table.
    _CELL_STYLE = {'border': '1.3px solid black', 'color': 'black', 'padding': '5px'}

    def max_likelihood(self, s: pd.Series):
        '''
        Maximum Likelihood Estimation: P(c2|c1)= count(c1,c2)/count(c1)

        Divides every entry of `s` by the total of `s`. A series that sums to
        zero yields NaN entries (0/0).
        '''
        return s / s.sum()

    def highlight_max_both_axes(self, s: pd.DataFrame):
        '''
        Assign a background colour showing rowwise and columnwise maxes.

        Returns a DataFrame of CSS strings indexed by `self.nilai` x `self.varu`:
        teal   - the cell is both its row's and its column's maximum,
        pink   - the cell is its row's maximum only,
        yellow - the cell is its column's maximum only (and greater than zero),
        white  - anything else (including NaN cells, which never compare equal).
        '''
        rmax = s.max(axis=1)
        cmax = s.max()
        # Build plain Python rows and construct the frame once. Writing strings
        # cell by cell into an int-initialised frame is an incompatible-dtype
        # assignment that recent pandas versions deprecate/reject.
        rows = []
        for n in self.nilai:
            row = []
            for v in self.varu:
                value = s.at[n, v]
                is_row_max = value == rmax[n]
                is_col_max = value == cmax[v]
                # NOTE(review): only the column-only branch requires value > 0, so
                # an all-zero row is still highlighted — confirm that is intended.
                if is_row_max and is_col_max:
                    color = "teal"
                elif is_row_max:
                    color = "pink"
                elif is_col_max and value > 0:
                    color = "yellow"
                else:
                    color = "white"
                row.append("background-color: %s" % color)
            rows.append(row)
        return pd.DataFrame(rows, index=self.nilai, columns=self.varu)

    def _export_table(self, table: pd.DataFrame, path: str):
        '''
        Render `table` with max-highlighting to a PNG at `path`.
        '''
        css = self.highlight_max_both_axes(table)
        styled = (
            table.style
            .set_properties(**self._CELL_STYLE)
            # Show missing likelihoods as '-' at render time instead of mixing
            # strings into the numeric frame.
            .format(na_rep="-")
            # `css` has the same labels as `table`, so it can be applied whole.
            .apply(lambda _: css, axis=None)
        )
        dfi.export(styled, path, dpi=300)

    def process(self, work):
        '''
        Count consonant transitions in `work` and export the three tables.

        Reads the text via preprocess_work(work) and writes
        out\\<work>.png, out\\<work>_row_mle.png and out\\<work>_col_mle.png.
        Also sets the global pandas option "styler.format.precision" to 3.
        '''
        print("Processing " + work)
        sents = self.preprocess_work(work)
        cfd = nltk.ConditionalFreqDist(self.compute_cfd(sents))
        self.nilai = iso_cons
        self.varu = iso_cons
        frame = pd.DataFrame(0, index=self.nilai, columns=self.varu)
        for c1, followers in cfd.items():
            for c2, count in followers.items():
                # Single .loc assignment: chained frame[c2][c1] = ... does not
                # write through under pandas Copy-on-Write.
                frame.loc[c1, c2] = count
        self._export_table(frame, "out\\" + work + ".png")

        pd.set_option("styler.format.precision", 3)
        row_mle = frame.apply(self.max_likelihood, axis=1)
        self._export_table(row_mle, "out\\" + work + "_row_mle.png")
        col_mle = frame.apply(self.max_likelihood, axis=0)
        self._export_table(col_mle, "out\\" + work + "_col_mle.png")

    def preprocess_work(self, work):
        '''
        Load `<root><work>.txt` and return its cleaned lines.

        Each line has `dropper` characters removed, `punct` characters turned
        into spaces and whitespace runs collapsed. Only lines containing more
        than two spaces after cleaning are kept.
        '''
        sents = []
        text = root + work + ".txt"
        # utf-8-sig strips a leading byte-order mark if present (and reads
        # BOM-less files identically), so a BOM cannot stick to the first word.
        with open(text, encoding="utf-8-sig") as handle:
            for line in handle:
                line = dropper.sub("", line)
                line = re.sub(r"\s+", " ", punct.sub(" ", line))
                # NOTE(review): the search string below is empty in this copy of
                # the file; as written str.replace inserts the replacement between
                # every character. A character was probably lost — confirm against
                # the original source.
                line = line.replace("", "ரி")
                if line.count(" ") > 2: # at least two cheers
                    sents.append(line)
        return sents

    def compute_cfd(self, sents):
        '''
        Return the list of (first, second) label pairs found in `sents`.

        For every whitespace-separated word and every consonant c1 in `con`:
        a word starting with c1 yields ('∅', c1); a word ending with c1 yields
        (c1, '∅'); and every c2 for which `c1 + pulli + c2` occurs in the word
        yields (c1, c2). A cluster is recorded once per word even if it occurs
        several times, because the check is a substring test.
        '''
        null = iso['']
        pairs = []
        for sent in sents:
            for word in sent.split():
                for first in con:
                    first_label = iso[first]
                    if word.startswith(first):
                        pairs.append((null, first_label))
                    if word.endswith(first):
                        pairs.append((first_label, null))
                    cluster_head = first + pulli
                    pairs.extend(
                        (first_label, iso[second])
                        for second in con
                        if cluster_head + second in word
                    )
        return pairs
# Base names of the corpus files to analyse; each is read as <root><name>.txt.
works = ["ainkurunuru", "akananuru", "kalithokai", "kurunthokai", "natrinai", "paripadal", "pathittrupathu", "purananuru", "எட்டுத்தொகை-consolidated"]
# One processor instance is reused for every work; process() resets its row and
# column labels on each call.
p = MayalProcessor()
for work in works:
    p.process(work)