forked from jin-zhe/boolean-retrieval-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
355 lines (308 loc) · 13 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
#!/usr/bin/python
import re
import nltk
import sys
import getopt
import codecs
import struct
import math
import io
import collections
import timeit
RECORD_TIME = False # toggling for recording the time taken for indexer
BYTE_SIZE = 4 # docID is in int
"""
conducts boolean queries from queries_file and writes outputs to output_file
params:
dictionary_file: dictionary file produced by indexer
postings_file: postings file produced by indexer
queries_file: file of boolean queries
output_file: responses to boolean queries
"""
def search(dictionary_file, postings_file, queries_file, output_file):
# open files
dict_file = codecs.open(dictionary_file, encoding='utf-8')
post_file = io.open(postings_file, 'rb')
query_file = codecs.open(queries_file, encoding='utf-8')
out_file = open(output_file, 'w')
# load dictionary to memory
loaded_dict = load_dictionary(dict_file)
dictionary = loaded_dict[0] # dictionary map
indexed_docIDs = loaded_dict[1] # list of all docIDs indexed in sorted order
dict_file.close()
# process each query
queries_list = query_file.read().splitlines()
for i in range(len(queries_list)):
query = queries_list[i]
result = process_query(query, dictionary, post_file, indexed_docIDs)
# write each result to output
for j in range(len(result)):
docID = str(result[j])
if (j != len(result) - 1):
docID += ' '
out_file.write(docID)
if (i != len(queries_list) - 1):
out_file.write('\n')
# close files
post_file.close()
query_file.close()
out_file.close()
"""
returns 2-tuple of loaded dictionary and total df
params:
dict_file: opened dictionary file
"""
def load_dictionary(dict_file):
dictionary = {} # dictionary map loaded
indexed_docIDs = [] # list of all docIDs indexed
docIDs_processed = False # if indexed_docIDs is processed
# load each term along with its df and postings file pointer to dictionary
for entry in dict_file.read().split('\n'):
# if entry is not empty (last line in dictionary file is empty)
if (entry):
# if first line of dictionary, process list of docIDs indexed
if (not docIDs_processed):
indexed_docIDs = [int(docID) for docID in entry[20:-1].split(',')]
docIDs_processed = True
# else if dictionary terms and their attributes
else:
token = entry.split(" ")
term = token[0]
df = int(token[1])
offset = int(token[2])
dictionary[term] = (df, offset)
return (dictionary, indexed_docIDs)
"""
returns the list of docIDs in the result for the given query
params:
query: the query string e.g. 'bill OR Gates AND (vista OR XP) AND NOT mac'
dictionary: the dictionary in memory
indexed_docIDs: the list of all docIDs indexed (used for negations)
"""
def process_query(query, dictionary, post_file, indexed_docIDs):
stemmer = nltk.stem.porter.PorterStemmer() # instantiate stemmer
# prepare query list
query = query.replace('(', '( ')
query = query.replace(')', ' )')
query = query.split(' ')
results_stack = []
postfix_queue = collections.deque(shunting_yard(query)) # get query in postfix notation as a queue
while postfix_queue:
token = postfix_queue.popleft()
result = [] # the evaluated result at each stage
# if operand, add postings list for term to results stack
if (token != 'AND' and token != 'OR' and token != 'NOT'):
token = stemmer.stem(token) # stem the token
# default empty list if not in dictionary
if (token in dictionary):
result = load_posting_list(post_file, dictionary[token][0], dictionary[token][1])
# else if AND operator
elif (token == 'AND'):
right_operand = results_stack.pop()
left_operand = results_stack.pop()
# print(left_operand, 'AND', left_operand) # check
result = boolean_AND(left_operand, right_operand) # evaluate AND
# else if OR operator
elif (token == 'OR'):
right_operand = results_stack.pop()
left_operand = results_stack.pop()
# print(left_operand, 'OR', left_operand) # check
result = boolean_OR(left_operand, right_operand) # evaluate OR
# else if NOT operator
elif (token == 'NOT'):
right_operand = results_stack.pop()
# print('NOT', right_operand) # check
result = boolean_NOT(right_operand, indexed_docIDs) # evaluate NOT
# push evaluated result back to stack
results_stack.append(result)
# print ('result', result) # check
# NOTE: at this point results_stack should only have one item and it is the final result
if len(results_stack) != 1: print ("ERROR: results_stack. Please check valid query") # check for errors
return results_stack.pop()
"""
returns posting list for term corresponding to the given offset
params:
post_file: opened postings file
length: length of posting list (same as df for the term)
offset: byte offset which acts as pointer to start of posting list in postings file
"""
def load_posting_list(post_file, length, offset):
post_file.seek(offset)
posting_list = []
for i in range(length):
posting = post_file.read(BYTE_SIZE)
docID = struct.unpack('I', posting)[0]
posting_list.append(docID)
return posting_list
"""
returns the list of postfix tokens converted from the given infix expression
params:
infix_tokens: list of tokens in original query of infix notation
"""
def shunting_yard(infix_tokens):
# define precedences
precedence = {}
precedence['NOT'] = 3
precedence['AND'] = 2
precedence['OR'] = 1
precedence['('] = 0
precedence[')'] = 0
# declare data strucures
output = []
operator_stack = []
# while there are tokens to be read
for token in infix_tokens:
# if left bracket
if (token == '('):
operator_stack.append(token)
# if right bracket, pop all operators from operator stack onto output until we hit left bracket
elif (token == ')'):
operator = operator_stack.pop()
while operator != '(':
output.append(operator)
operator = operator_stack.pop()
# if operator, pop operators from operator stack to queue if they are of higher precedence
elif (token in precedence):
# if operator stack is not empty
if (operator_stack):
current_operator = operator_stack[-1]
while (operator_stack and precedence[current_operator] > precedence[token]):
output.append(operator_stack.pop())
if (operator_stack):
current_operator = operator_stack[-1]
operator_stack.append(token) # add token to stack
# else if operands, add to output list
else:
output.append(token.lower())
# while there are still operators on the stack, pop them into the queue
while (operator_stack):
output.append(operator_stack.pop())
# print ('postfix:', output) # check
return output
"""
returns the list of docIDs which is the compliment of given right_operand
params:
right_operand: sorted list of docIDs to be complimented
indexed_docIDs: sorted list of all docIDs indexed
"""
def boolean_NOT(right_operand, indexed_docIDs):
# complement of an empty list is list of all indexed docIDs
if (not right_operand):
return indexed_docIDs
result = []
r_index = 0 # index for right operand
for item in indexed_docIDs:
# if item do not match that in right_operand, it belongs to compliment
if (item != right_operand[r_index]):
result.append(item)
# else if item matches and r_index still can progress, advance it by 1
elif (r_index + 1 < len(right_operand)):
r_index += 1
return result
"""
returns list of docIDs that results from 'OR' operation between left and right operands
params:
left_operand: docID list on the left
right_operand: docID list on the right
"""
def boolean_OR(left_operand, right_operand):
result = [] # union of left and right operand
l_index = 0 # current index in left_operand
r_index = 0 # current index in right_operand
# while lists have not yet been covered
while (l_index < len(left_operand) or r_index < len(right_operand)):
# if both list are not yet exhausted
if (l_index < len(left_operand) and r_index < len(right_operand)):
l_item = left_operand[l_index] # current item in left_operand
r_item = right_operand[r_index] # current item in right_operand
# case 1: if items are equal, add either one to result and advance both pointers
if (l_item == r_item):
result.append(l_item)
l_index += 1
r_index += 1
# case 2: l_item greater than r_item, add r_item and advance r_index
elif (l_item > r_item):
result.append(r_item)
r_index += 1
# case 3: l_item lower than r_item, add l_item and advance l_index
else:
result.append(l_item)
l_index += 1
# if left_operand list is exhausted, append r_item and advance r_index
elif (l_index >= len(left_operand)):
r_item = right_operand[r_index]
result.append(r_item)
r_index += 1
# else if right_operand list is exhausted, append l_item and advance l_index
else:
l_item = left_operand[l_index]
result.append(l_item)
l_index += 1
return result
"""
returns list of docIDs that results from 'AND' operation between left and right operands
params:
left_operand: docID list on the left
right_operand: docID list on the right
"""
def boolean_AND(left_operand, right_operand):
# perform 'merge'
result = [] # results list to be returned
l_index = 0 # current index in left_operand
r_index = 0 # current index in right_operand
l_skip = int(math.sqrt(len(left_operand))) # skip pointer distance for l_index
r_skip = int(math.sqrt(len(right_operand))) # skip pointer distance for r_index
while (l_index < len(left_operand) and r_index < len(right_operand)):
l_item = left_operand[l_index] # current item in left_operand
r_item = right_operand[r_index] # current item in right_operand
# case 1: if match
if (l_item == r_item):
result.append(l_item) # add to results
l_index += 1 # advance left index
r_index += 1 # advance right index
# case 2: if left item is more than right item
elif (l_item > r_item):
# if r_index can be skipped (if new r_index is still within range and resulting item is <= left item)
if (r_index + r_skip < len(right_operand)) and right_operand[r_index + r_skip] <= l_item:
r_index += r_skip
# else advance r_index by 1
else:
r_index += 1
# case 3: if left item is less than right item
else:
# if l_index can be skipped (if new l_index is still within range and resulting item is <= right item)
if (l_index + l_skip < len(left_operand)) and left_operand[l_index + l_skip] <= r_item:
l_index += l_skip
# else advance l_index by 1
else:
l_index += 1
return result
"""
prints the proper command usage
"""
def print_usage():
print ("usage: " + sys.argv[0] + " -d dictionary-file -p postings-file -q file-of-queries -o output-file-of-results")
dictionary_file = postings_file = queries_file = output_file = None
try:
opts, args = getopt.getopt(sys.argv[1:], 'd:p:q:o:')
except (getopt.GetoptError, err):
usage()
sys.exit(2)
for o, a in opts:
if o == '-d':
dictionary_file = a
elif o == '-p':
postings_file = a
elif o == '-q':
queries_file = a
elif o == '-o':
output_file = a
else:
assert False, "unhandled option"
if (dictionary_file == None or postings_file == None or queries_file == None or output_file == None):
print_usage()
sys.exit(2)
if (RECORD_TIME): start = timeit.default_timer() # start time
search(dictionary_file, postings_file, queries_file, output_file) # call the search engine on queries
if (RECORD_TIME): stop = timeit.default_timer() # stop time
if (RECORD_TIME): print ('Querying time:' + str(stop - start)) # print time taken