-
Notifications
You must be signed in to change notification settings - Fork 0
/
thesaurus.py
244 lines (190 loc) · 8.38 KB
/
thesaurus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
import requests
from bs4 import BeautifulSoup
from pprint import pprint
"""
I will do my best to come back and add better comments/docstrings to this file
as quickly as I can, but for now the README should suffice as documentation.
If I am slow about it, please open an issue or email me with a reminder.
"""
def formatWordUrl(inputWord):
    """Build the thesaurus.com browse URL for *inputWord*.

    The word is stripped of surrounding whitespace and lower-cased, then
    percent-encoded as a single URL path segment. Spaces become ``%20`` as
    before; characters that would otherwise change the meaning of the URL
    (``/``, ``?``, ``#``, ``%`` ...) are now escaped as well instead of being
    passed through verbatim.

    Args:
        inputWord: the word or phrase to look up (a ``str``).

    Returns:
        The full URL as a string.
    """
    # Local import so the function is self-contained within this module.
    from urllib.parse import quote

    slug = inputWord.strip().lower()
    # safe='' makes quote() escape '/' too: the word is one path segment.
    return 'http://www.thesaurus.com/browse/' + quote(slug, safe='')
def btw(inputString, lh, rh):
    """Return the part of *inputString* found between *lh* and *rh*.

    Everything up to and including the first occurrence of *lh* is dropped,
    then the remainder is cut at the first occurrence of *rh*. When *rh* does
    not occur, the whole remainder is returned.

    Raises:
        IndexError: if *lh* does not occur in *inputString*.
    """
    # Indexing [1] is what raises IndexError when the left marker is absent.
    afterLeft = inputString.split(lh, 1)[1]
    beforeRight, _sep, _rest = afterLeft.partition(rh)
    return beforeRight
def getFilter(keyName, filters):
    """Look up one filter value in a keyword-argument mapping.

    Args:
        keyName: name of the filter, e.g. ``'relevance'``.
        filters: either ``{'filters': {...}}`` (the shape produced when a
            caller forwards its filters as a single ``filters=`` keyword) or
            a flat ``{name: value}`` mapping.

    Returns:
        The value stored under *keyName*, or ``None`` when it is not present.
    """
    # Prefer the nested mapping; fall back to the flat one so that a missing
    # 'filters' key no longer raises KeyError.
    criteria = filters.get('filters', filters)
    return criteria.get(keyName)
def fetchWordData(inputWord, timeout=10):
    """Download and parse the thesaurus.com page for *inputWord*.

    Args:
        inputWord: the word or phrase to look up.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller forever.

    Returns:
        A list with one dict per definition tab found on the page, each with
        the keys ``'partOfSpeech'``, ``'meaning'``, ``'syn'`` and ``'ant'``.
        ``'syn'`` and ``'ant'`` hold tuples of
        ``(word, relevance, length, complexity, form)``.
        The LAST element of the list is a dict with the keys ``'examples'``
        and ``'origin'``, so that a caller can ``.pop()`` it off.

    Raises:
        Whatever ``requests.get`` raises (including on timeout), plus
        ``IndexError``/``KeyError``/``ValueError`` if the page markup does not
        carry the elements and attributes read below.
    """
    def clean(text):
        # Trim whitespace and turn curly double quotes into plain ones.
        return text.strip().replace('\u201d', '"').replace('\u201c', '"')

    url = formatWordUrl(inputWord)
    r = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html.parser')

    defns = []
    # One tab per definition; each tab carries the part of speech in <em> and
    # the meaning in <strong>.
    posTags = soup.select("div.mask a.pos-tab")
    for defnNum, tab in enumerate(posTags):
        curr_def = {
            'partOfSpeech': tab.select('em')[0].text,
            'meaning': tab.select('strong')[0].text,
            'syn': [],
            'ant': []
        }
        for link in soup.select('div#synonyms-{} li a'.format(defnNum)):
            # The integer that follows 'relevant-' (up to the next double
            # quote) in data-category; its sign selects synonym vs antonym
            # and its magnitude is stored as the relevance.
            category = int(btw(link.attrs['data-category'], 'relevant-', '"'))
            if category > 0:
                # the -4 is to remove the star text. String manipulation was
                # chosen over doing another select on the lower span; this
                # may have to change if the site removes the star thing.
                kind = 'syn'
                word = link.text[:-4]
            else:
                # antonyms don't have the star text.
                kind = 'ant'
                word = str(link.text)
            length = int(link.attrs['data-length'])
            complexity = int(link.attrs['data-complexity'])
            try:
                form = link.attrs['class'][0]
            except (KeyError, IndexError):
                # No class attribute (or an empty one): form is unknown.
                form = None
            # tuple key is (word, relevance, length, complexity, form)
            curr_def[kind].append(
                (word, abs(category), length, complexity, form))
        defns.append(curr_def)

    # add origin and examples to the last element so we can .pop() it out later
    origin = [clean(p.text) for p in soup.select("div#word-origin div p")]
    defns.append({
        'examples': [clean(p.text)
                     for p in soup.select("div#example-sentences div p")],
        # TODO: fix this, as there is a '...' that appears. Use span.oneClick-link
        'origin': origin[0] if origin else ''
    })
    return defns
class Word:
    """Synonyms, antonyms, origin and examples looked up for one word.

    Attributes set by ``__init__``:
        url: the page address built by ``formatWordUrl``.
        data: list with one dict per definition. The methods below read the
            keys ``'partOfSpeech'``, ``'syn'`` and ``'ant'``; entries of
            ``'syn'``/``'ant'`` are tuples of
            ``(word, relevance, length, complexity, form)``.
        extra: dict holding ``'origin'`` and ``'examples'``.
        synonym_list: unfiltered synonyms of the first definition.
    """

    # Filterable entry fields, in the order in which they follow the word
    # inside each (word, relevance, length, complexity, form) tuple.
    _ENTRY_FILTERS = ('relevance', 'length', 'complexity', 'form')

    def __init__(self, inputWord):
        # in case you want to visit it later
        self.url = formatWordUrl(inputWord)
        # fetch the data from thesaurus.com
        self.data = fetchWordData(inputWord)
        # fetchWordData puts origin/examples in the last element.
        self.extra = self.data.pop()
        self.synonym_list = self.synonyms()

    def __len__(self):
        """Return the number of definitions the word has."""
        return len(self.data)

    ### FUNCTIONS TO HELP ORGANIZE DATA WITHIN THE CLASS ###

    @staticmethod
    def _normalizeForm(form):
        """Map loose spellings of a word form onto the stored class names."""
        if isinstance(form, str):
            lowered = form.lower()
            if 'informal' in lowered:
                return 'informal-word'
            if 'common' in lowered:
                return 'common-word'
        return form

    def filter(self, defnNum='all', **filters):
        """Return the syn/ant entries that satisfy the given filters.

        Filter values are read through ``getFilter``; ``synonyms()`` and
        ``antonyms()`` forward theirs as one ``filters={...}`` keyword.
        Recognised names are ``relevance``, ``partOfSpeech``, ``length``,
        ``complexity`` and ``form``. Each of the four entry filters may be a
        single value or a list/tuple/set of accepted values; ``None`` (or
        leaving it out) means "do not filter on this field".

        Args:
            defnNum: ``'all'`` to process every definition, otherwise the
                index of the single definition to process.

        Returns:
            ``[]`` when the word has no definitions. Otherwise one dict per
            processed definition: ``{'syn': [...], 'ant': [...]}`` with the
            surviving entry tuples, or an empty dict ``{}`` as a placeholder
            when the definition's part of speech is not ``in`` the
            ``partOfSpeech`` filter (a substring test when that filter is a
            string, a membership test when it is a list).

        Raises:
            IndexError: if *defnNum* is an index outside ``self.data``.
        """
        if len(self) == 0:
            return []

        partOfSpeech = getFilter('partOfSpeech', filters)

        # Build (tuple position, accepted values) pairs for active filters.
        criteria = []
        for position, name in enumerate(self._ENTRY_FILTERS, start=1):
            accepted = getFilter(name, filters)
            if isinstance(accepted, (list, tuple, set, frozenset)):
                accepted = list(accepted)
            else:
                accepted = [accepted]
            if name == 'form':
                # tolerate user variations such as 'Informal' or 'common'
                accepted = [self._normalizeForm(value) for value in accepted]
            if accepted != [None]:
                criteria.append((position, accepted))

        def matches(entry):
            return all(entry[position] in accepted
                       for position, accepted in criteria)

        # we are going to assume they want to filter all of the definitions.
        # if not, we only filter over that ONE definition number.
        if defnNum == 'all':
            definitions = self.data
        else:
            definitions = [self.data[defnNum]]

        fdata = []  # the data we are going to return
        for definition in definitions:
            if (partOfSpeech is not None
                    and definition['partOfSpeech'] not in partOfSpeech):
                # keep the slot so result positions still line up
                fdata.append({})
                continue
            fdata.append({
                'syn': [entry for entry in definition['syn'] if matches(entry)],
                'ant': [entry for entry in definition['ant'] if matches(entry)],
            })
        return fdata

    ### FUNCTIONS TO RETURN DATA YOU WANT ###
    # Each of the following functions allows you to filter the output
    # accordingly: relevance, partOfSpeech, length, complexity, form.

    def _collect(self, kind, defnNum, allowEmpty, filters):
        """Shared body of ``synonyms`` and ``antonyms``.

        *kind* is ``'syn'`` or ``'ant'``. Returns bare words (the first item
        of every entry tuple), grouped per definition.
        """
        perDefinition = [
            [entry[0] for entry in definition.get(kind, [])]
            for definition in self.filter(defnNum=defnNum, filters=filters)
        ]
        # the word does not exist. return empty.
        if not perDefinition:
            return []
        if defnNum != 'all':
            return perDefinition[0]
        if allowEmpty:
            return perDefinition
        return [words for words in perDefinition if words]

    def synonyms(self, defnNum=0, allowEmpty=True, **filters):
        """Return the synonyms of the word.

        Args:
            defnNum: definition index, or ``'all'`` for every definition.
            allowEmpty: only used with ``defnNum='all'``; when false,
                definitions without any remaining synonym are left out.
            **filters: entry filters, see ``filter``.

        Returns:
            A list of words for a single definition, or a list of such lists
            when ``defnNum='all'``. ``[]`` if the word has no definitions.
        """
        return self._collect('syn', defnNum, allowEmpty, filters)

    def antonyms(self, defnNum=0, allowEmpty=True, **filters):
        """Return the antonyms of the word.

        Args:
            defnNum: definition index, or ``'all'`` for every definition.
            allowEmpty: only used with ``defnNum='all'``; when false,
                definitions without any remaining antonym are left out.
            **filters: entry filters, see ``filter``.

        Returns:
            A list of words for a single definition, or a list of such lists
            when ``defnNum='all'``. ``[]`` if the word has no definitions.
        """
        return self._collect('ant', defnNum, allowEmpty, filters)

    def origin(self):
        """Return the stored word-origin text."""
        return self.extra['origin']

    def examples(self):
        """Return the stored list of example sentences."""
        return self.extra['examples']