indexer.py
from copy import deepcopy
from heapq import merge
import json
import pickle

from nltk.corpus import brown

from encoding import (gap_encoding, gamma_encoding, gap_decoding, gamma_decoding,
                      bitstring_to_bytes, bytes_to_bit_string)


class Indexer:
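    """In-memory inverted index: maps each term to the sorted list of ids of the
    documents that contain it. Terms are kept in lexicographic order, so a term's
    id is its rank in the vocabulary.
    """
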
    def __init__(self, name):
        self.name = name
        self.indexer = []           # postings lists, indexed by term id
        self.number_of_words = 0    # total number of (term, doc id) postings
        self.term_id_to_term = []   # sorted list of terms; position is the term id
        self.term_to_term_id = {}   # reverse mapping: term -> term id
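
    # Insert a previously unseen term at its sorted position in the vocabulary,
    # create an empty postings list for it, and shift the ids of every term that
    # now comes after it so that all three structures stay aligned.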
    def add_word_to_indexer(self, word):
        insert_pos = len(self.term_id_to_term)
        for i in range(len(self.term_id_to_term)):
            if self.term_id_to_term[i] > word:
                insert_pos = i
                break
        self.term_id_to_term.insert(insert_pos, word)
        self.indexer.insert(insert_pos, [])
        for word_iter in self.term_to_term_id:
            if self.term_to_term_id[word_iter] >= insert_pos:
                self.term_to_term_id[word_iter] += 1
        self.term_to_term_id[word] = insert_pos
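
    # Record that `word` occurs in document `doc_id`: add the term to the
    # vocabulary if needed, then add the doc id to its postings list
    # (kept sorted and duplicate-free).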
    def add_word_to_document(self, word, doc_id):
        if word not in self.term_id_to_term:
            self.add_word_to_indexer(word)
        term_id = self.term_to_term_id[word]
        if doc_id not in self.indexer[term_id]:
            self.indexer[term_id].append(doc_id)
            self.indexer[term_id] = sorted(self.indexer[term_id])
            self.number_of_words += 1
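
    # Persist the index: postings lists are gap-encoded, gamma-encoded and packed
    # into bytes before pickling; the term -> term id map is stored as JSON.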
    def save_indexer_to_disk(self):
        compressed_index = deepcopy(self.indexer)
        for i in range(len(compressed_index)):
            compressed_index[i] = gap_encoding(compressed_index[i])
        for i in range(len(compressed_index)):
            compressed_index[i] = gamma_encoding(compressed_index[i])
        for i in range(len(compressed_index)):
            compressed_index[i] = bitstring_to_bytes(compressed_index[i])
        with open('term_to_term_id/term_to_term_id_{}.txt'.format(self.name), 'w') as filehandle:
            json.dump(self.term_to_term_id, filehandle)
        with open('indexers/{}_index.data'.format(self.name), 'wb') as filehandle:
            pickle.dump(compressed_index, filehandle)
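
    # Rebuild the index from disk by reversing the steps of save_indexer_to_disk.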
    def load_indexer_from_disk(self):
        self.indexer = []
        self.number_of_words = 0
        self.term_id_to_term = []
        self.term_to_term_id = {}
        with open('term_to_term_id/term_to_term_id_{}.txt'.format(self.name), 'r') as filehandle:
            self.term_to_term_id = json.load(filehandle)
        self.term_id_to_term = [-1] * len(self.term_to_term_id)
        for term, term_id in self.term_to_term_id.items():
            self.term_id_to_term[term_id] = term
        with open('indexers/{}_index.data'.format(self.name), 'rb') as filehandle:
            compressed_index = pickle.load(filehandle)
        for i in range(len(compressed_index)):
            compressed_index[i] = bytes_to_bit_string(compressed_index[i])
        for i in range(len(compressed_index)):
            compressed_index[i] = gamma_decoding(compressed_index[i])
        for i in range(len(compressed_index)):
            compressed_index[i] = gap_decoding(compressed_index[i])
        self.indexer = compressed_index
        for i in range(len(self.indexer)):
            self.number_of_words += len(self.indexer[i])
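
    # Human-readable dump: one line per term with its postings list.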
    def __str__(self):
        res = ""
        for i in range(len(self.term_id_to_term)):
            res = res + self.term_id_to_term[i] + " : " + str(self.indexer[i]) + "\n"
        return res
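

# Merge two Indexer objects into a new one: the vocabularies are unioned and the
# postings lists of shared terms are merged in sorted order (duplicates are not
# removed, so the two inputs are expected to cover disjoint document id ranges).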
def merge_indexes(index_A, index_B):
    term_id_to_term = sorted(set(merge(index_A.term_id_to_term, index_B.term_id_to_term)))
    term_to_term_id = {term: term_id for term_id, term in enumerate(term_id_to_term)}
    total_size = 0
    indexer = [list() for i in range(len(term_id_to_term))]
    for term in term_id_to_term:
        postings_1 = [] if term not in index_A.term_to_term_id else index_A.indexer[index_A.term_to_term_id[term]]
        postings_2 = [] if term not in index_B.term_to_term_id else index_B.indexer[index_B.term_to_term_id[term]]
        indexer[term_to_term_id[term]] = list(merge(postings_1, postings_2))
        total_size += len(indexer[term_to_term_id[term]])
    result = Indexer("merge_{}_{}".format(index_A.name, index_B.name))
    result.term_id_to_term = term_id_to_term
    result.term_to_term_id = term_to_term_id
    result.indexer = indexer
    result.number_of_words = total_size
    return result
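

# Basic demos: build two partial indexes over disjoint document ranges, save and
# reload them, and merge them into a single index.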
if __name__ == '__main__':
    def try_on_small_data():
        docs = ["Good morning new york q",
                "Bonjour le monde",
                "Winek Pacine",
                "Good morning san francisco",
                "Bonjour le monde",
                "Winek 7awem"]
        indexer = Indexer("my_small_index1")
        for doc_id in range(4):
            doc = docs[doc_id]
            for word in doc.split():
                indexer.add_word_to_document(word.lower(), doc_id + 1)
        print(indexer)
        indexer.save_indexer_to_disk()
        new_index = Indexer("my_small_index1")
        new_index.load_indexer_from_disk()
        print(new_index)
        indexer2 = Indexer("my_small_index2")
        for doc_id in range(4, len(docs)):
            doc = docs[doc_id]
            for word in doc.split():
                indexer2.add_word_to_document(word.lower(), doc_id + 1)
        print(indexer2)
        merge_index = merge_indexes(indexer, indexer2)
        print(merge_index)

    def try_on_larger_data():
        docs = ['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', 'ca08']
        index1 = Indexer("my_index1")
        for doc_id in range(4):
            doc_name = docs[doc_id]
            for word in brown.words(doc_name):
                index1.add_word_to_document(word.lower(), doc_id + 1)
        print(index1)
        index1.save_indexer_to_disk()
        index2 = Indexer("my_index2")
        for doc_id in range(4, len(docs)):
            doc_name = docs[doc_id]
            for word in brown.words(doc_name):
                # lowercase here as well so both halves are normalised consistently
                index2.add_word_to_document(word.lower(), doc_id + 1)
        print(index2)
        index2.save_indexer_to_disk()
        index1.load_indexer_from_disk()
        index2.load_indexer_from_disk()
        merge_index = merge_indexes(index1, index2)
        print(merge_index)
        merge_index.save_indexer_to_disk()

    try_on_larger_data()