-
Notifications
You must be signed in to change notification settings - Fork 1
/
model.py
471 lines (351 loc) · 18 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
"""
Models and database functions for Hackbright Independent project.
by Patricia Decker 11/2015.
"""
# import correlation # correlation.pearson
from flask_sqlalchemy import SQLAlchemy
import datetime
# This is the connection to the SQLite database; we're getting this through
# the Flask-SQLAlchemy helper library. On this, we can find the `session`
# object, where we do most of our interactions (like committing, etc.)
db = SQLAlchemy()
CAT_CODES = {'unknown': 'unkn',
'gluten': 'gltn',
'allergy': 'algy',
'paleo': 'pleo',
'kosher': 'kshr',
'vegan': 'vgan'
}
##############################################################################
# Model definitions
class YelpBiz(db.Model):
"""Business in Yelp Academic Dataset."""
__tablename__ = "yelpBiz"
# yelp_biz_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
biz_id = db.Column(db.Unicode(32), primary_key=True)
name = db.Column(db.Unicode(200), nullable=False)
address = db.Column(db.Unicode(200), nullable=False)
city = db.Column(db.Unicode(64), nullable=False)
state = db.Column(db.String(3), nullable=False)
lat = db.Column(db.Float(Precision=64), nullable=False) #TODO: is there a lat/long type?
lng = db.Column(db.Float(Precision=64), nullable=False)
stars = db.Column(db.Float, nullable=True) # biz can have no reviews
review_count = db.Column(db.Integer, nullable=False, default=0)
is_open = db.Column(db.Integer, nullable=False, default=True)
photo_url = db.Column(db.String(200), nullable=True)
# yelp_url = db.Column(db.String(200), nullable=False)
# V2TODO: neighborhoods
# V2TODO: schools (nearby universities)
def __repr__(self):
return "<YelpBiz biz_id=%d name=%s>" % (self.biz_id, self.name)
class YelpUser(db.Model):
"""User in Yelp Academic Dataset."""
__tablename__ = "yelpUsers"
user_id = db.Column(db.Unicode(32), primary_key=True)
name = db.Column(db.Unicode(64), nullable=False)
review_count = db.Column(db.Integer, nullable=False, default=0) # a user might have no reviews
average_stars = db.Column(db.Float, nullable=False, default=0.0) # this is calculable from other tables
# V2TODO: votes
def __repr__(self):
return "<YelpUser user_id=%s name=%s>" % (self.user_id, self.name)
class YelpReview(db.Model):
"""Review in Yelp Academic Dataset."""
__tablename__ = "yelpReviews"
# TODO: make business_id primary key??
review_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
biz_id = db.Column(db.Unicode(32), db.ForeignKey('yelpBiz.biz_id'))
user_id = db.Column(db.Unicode(32), db.ForeignKey('yelpUsers.user_id'))
stars = db.Column(db.Integer, nullable=False) # integer 1-5 #TODO: restrict or check
text = db.Column(db.Text, nullable=False)
date = db.Column(db.Date)
biz = db.relationship('YelpBiz',
backref=db.backref('reviews', order_by=review_id))
user = db.relationship('YelpUser',
backref=db.backref('reviews', order_by=review_id))
def __repr__(self):
return "<YelpReview biz_id=%d user_id=%d>" % (self.biz_id, self.user_id)
class PlatePalBiz(db.Model):
"""Business on PlatePal website. (Builds on Yelp database.)"""
__tablename__ = "biz"
biz_id = db.Column(db.Integer, autoincrement=True, primary_key=True) # TODO: yelp biz id?
yelp_biz_id = db.Column(db.Unicode(32), db.ForeignKey('yelpBiz.biz_id')) # TODO: can this be nullable???
name = db.Column(db.Unicode(200), nullable=False)
address = db.Column(db.Unicode(200), nullable=False)
city = db.Column(db.Unicode(64), nullable=False)
state = db.Column(db.String(3), nullable=False)
lat = db.Column(db.Float(Precision=64), nullable=False) #TODO: is there a lat/long type?
lng = db.Column(db.Float(Precision=64), nullable=False)
is_open = db.Column(db.Integer, nullable=False)
photo_url = db.Column(db.String(200), nullable=True)
# pp_url = db.Column(db.String(200), nullable=False) # PlatePal url of biz listing
# sen_score calculated in BizSentiments or calculable from ...
# review_count = db.Column(db.Integer, nullable=True)
# TODO: neighborhoods?
def __repr__(self):
return "<PlatePalBiz biz_id=%d name=%s>" % (self.biz_id, self.name)
class PlatePalUser(db.Model):
"""
User of PlatePal website.
(Builds on Yelp to allow future OAuth integration.)
"""
# no need at moment for yelp user info
__tablename__ = "users"
user_id = db.Column(db.Integer, primary_key=True)
email = db.Column(db.String(64), nullable=False, unique=True)
password = db.Column(db.String(32), nullable=False)
fname = db.Column(db.String(32), nullable=False)
lname = db.Column(db.String(32), nullable=False) # is this long enough?
bday = db.Column(db.Date, nullable=False)
# city = db.Column(db.String(30), nullable=False)
# get around this by doing a browser request for geolocation info
sentiments = db.relationship('ReviewCategory', secondary='reviews',
backref='user')
def __repr__(self):
return "<PlatePalUser user_id=%d (fname lname)=%s %s>" % (self.user_id, self.fname, self.lname)
class PlatePalReview(db.Model):
"""
Compiles Yelp Reviews, if they exist, or allows users to review a business
directly on PlatePal.
"""
__tablename__ = "reviews"
review_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
yelp_review_id = db.Column(db.Integer, db.ForeignKey('yelpReviews.review_id'))
yelp_stars = db.Column(db.Integer, db.ForeignKey('yelpReviews.stars'))
biz_id = db.Column(db.Integer, db.ForeignKey('biz.biz_id'))
user_id = db.Column(db.Integer, db.ForeignKey('users.user_id'))
yelp_user_id = db.Column(db.Unicode(32), db.ForeignKey('yelpUsers.user_id'))
cat_code = db.Column(db.Integer, db.ForeignKey('categories.cat_code')) # this is actually a string...
stars = db.Column(db.Integer, nullable=True)
review_date = db.Column(db.Date, default=datetime.datetime.utcnow)
text = db.Column(db.Text, nullable=False)
biz = db.relationship('PlatePalBiz',
backref=db.backref('reviews', order_by=review_id))
user = db.relationship('PlatePalUser',
backref=db.backref('reviews', order_by=review_id))
yelp_user = db.relationship('YelpUser',
backref=db.backref('ppreviews', order_by=review_id))
def __repr__(self):
return "<PlatePalReview review_id=%s biz_id=%s>" % (self.review_id, self.biz_id)
class UserList(db.Model):
"""User-generated list by category of restaurants."""
__tablename__ = "lists"
list_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
user_id = db.Column(db.Integer, db.ForeignKey('users.user_id'))
cat_name = db.Column(db.String(32), db.ForeignKey('categories.cat_name'))
list_name = db.Column(db.String(64), nullable=False)
user = db.relationship('PlatePalUser',
backref=db.backref('lists', order_by=list_id))
def __repr__(self):
return "<UserList list_id=%s user_id=%s>" % (self.list_id, self.user_id)
class ListEntry(db.Model):
"""Restaurant entries in user-generated list, UserList."""
# TODO: is this table necessary? really storing unique pairs
# of list_id & biz_id ...
__tablename__ = "entries"
entry_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
list_id = db.Column(db.Integer, db.ForeignKey('lists.list_id'))
biz_id = db.Column(db.Integer, db.ForeignKey('biz.biz_id'))
user_list = db.relationship('UserList',
backref=db.backref('entries', order_by=biz_id))
def __repr__(self):
return "<ListEntry list_id=%s biz_id=%s>" % (self.list_id, self.biz_id)
class Category(db.Model):
"""Categories for classification and targeting sentiment analysis."""
__tablename__ = "categories"
cat_code = db.Column(db.String(4), primary_key=True)
cat_name = db.Column(db.String(32), nullable=False)
cat_description = db.Column(db.Text, nullable=True)
def __repr__(self):
return "<Category cat_name=%s>" % self.cat_name
class ReviewCategory(db.Model):
"""
Association table between reviews and categories.
Allows for determination of a sentiment score on an individual review,
as a review has many categories and a category has many reviews. Space
to store user-generated sentiment score (feedback on machine-generated
score).
"""
__tablename__ = "revcats"
# TODO: using unique pairs as primary key??
revcat_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
review_id = db.Column(db.Integer, db.ForeignKey('reviews.review_id'))
biz_id = db.Column(db.Integer, db.ForeignKey('biz.biz_id'))
cat_code = db.Column(db.Integer, db.ForeignKey('categories.cat_code')) # this is actually a string...
sen_score = db.Column(db.Float, nullable=True) # machine generated score
user_sen = db.Column(db.Float, nullable=True) # for user feedback on score
biz = db.relationship('PlatePalBiz',
backref=db.backref('revcat', order_by=cat_code))
review = db.relationship('PlatePalReview',
backref=db.backref('revcat', order_by=cat_code))
def __repr__(self):
return "<ReviewCategory revcat_id=%s>" % (self.revcat_id)
def compare_sen_scores(self):
"""
for a review in a category (a revcat), compare the review-level
sentiment score (revcats.sen_score) to the average of the
sentence-level sentiment scores
(sentcat.sen_score for sent_id where sentences.review_id == revcats.review_id
>>> revcat = db.session.query(ReviewCategory).filter(ReviewCategory.revcat_id==245).first()
>>> revcat.compare_sen_scores()
sentcat.cat_code does not match revcat.cat_code
avg_sen_score not computed.
COMPARING REVIEW SENTIMENT TO SENTENCE SENTIMENT(S):
The review-category sentiment score is: 0.982730
The average sentence-category sentiment score is: 0.000000
>>> revcat = db.session.query(ReviewCategory).filter(ReviewCategory.revcat_id==114).first()
>>> revcat.compare_sen_scores()
computing avg_sen_score ...
COMPARING REVIEW SENTIMENT TO SENTENCE SENTIMENT(S):
The review-category sentiment score is: 0.979323
The average sentence-category sentiment score is: 0.984512
TODO: figure out why sentiment classifier is classifying differently for
reviews vs. sentences, as in this case (revcat_id=114), the review text is
a single sentence.
"""
# query db for sentcat sen_scores where sentences.review_id == revcat.review_id
sentcats = db.session.query(SentenceCategory).join(Sentence).filter(Sentence.review_id==self.review_id, SentenceCategory.cat_code==self.cat_code).all()
# import pdb; pdb.set_trace()
tot_sen_score = 0
num_scores = 0
for sc in sentcats:
if sc.cat_code == self.cat_code:
num_scores += 1
tot_sen_score += sc.sen_score
if num_scores != 0:
print "computing avg_sen_score ..."
avg_sen_score = (tot_sen_score / num_scores) / 1.0
else:
print "sentcat.cat_code does not match revcat.cat_code"
print "avg_sen_score not computed."
avg_sen_score = 0.0
print "COMPARING REVIEW SENTIMENT TO SENTENCE SENTIMENT(S):"
print "The review-category sentiment score is: %f" % self.sen_score
print "The average sentence-category sentiment score is: %f" % avg_sen_score
return avg_sen_score
class BizSentiment(db.Model):
"""Calculation table for aggregate sentiment for a business-category pair."""
# in postgreSQL, this would have a watcher on it (KLF)
__tablename__ = "bizsentiments"
bizsen_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
biz_id = db.Column(db.Integer, db.ForeignKey('biz.biz_id'))
cat_code = db.Column(db.Integer, db.ForeignKey('categories.cat_code')) # this is actually a string...
agg_sen_score = db.Column(db.Float, nullable=True) # to be calculated for individual scores an updated periodically
avg_cat_review = db.Column(db.Float, nullable=True)
num_revs = db.Column(db.Integer, nullable=True)
biz = db.relationship('PlatePalBiz',
backref=db.backref('sentiments', order_by=cat_code))
def __repr__(self):
return "<BizSentiment bizsen_id=%s>" % self.bizsen_id
# # MVP 3a. build class/method for avg rating per cat
# def calc_avg_rating_per_cat(self):
# """Calculate average stars for business by category"""
# # for biz_id
# # for a category
# # find all reviews in the current category
# # cat_code = category
# # take average of stars for all of those reviews
# # for review in list, sum = sum + review(stars)
# # average = sum/len(list)
# # update attribute
# # update database.
## post-MVP ##
# class ReviewSentence(db.Model):
# """
# Association table between review-categories and sentence-categories.
# A review has many categories --> review-categories
# A review-category has many sentences --> review-sentences
# A sentence has many categories --> sentence-categories
# """
# __tablename__ = "revsents"
# revsent_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
# sent_id = db.Column(db.Integer, db.ForeignKey('sentcats.sent_id'))
# review_id = db.Column(db.Integer, db.ForeignKey('reviews.review_id')) # TODO: should this be revcat id?
# # sent_text = db.Column(db.Text, nullable=False)
# sen_score = db.Column(db.Float, nullable=True)
# def __repr__(self):
# return "<ReviewSentence sent_id=%s review_id=%s>" % (self.sent_id, self.review_id) # TODO: revcat?
class SentenceCategory(db.Model):
"""
Association table between sentences and categories.
A sentence has many categories <--> a category has many sentences
"""
__tablename__ = "sentcats"
sentcat_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
sent_id = db.Column(db.Integer, db.ForeignKey('sentences.sent_id'))
cat_code = db.Column(db.Integer, db.ForeignKey('categories.cat_code'))
sen_score = db.Column(db.Integer, nullable=True)
def __repr__(self):
return "<SentenceCategory sent_id=%s cat_code=%s>" % (self.sent_id, self.cat_code)
class Sentence(db.Model):
"""
Storing individual sentences of reviews, assuming sentiment analysis
performed on a sentence-by-sentence level of granularity.
"""
__tablename__ = "sentences"
sent_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
review_id = db.Column(db.Integer, db.ForeignKey('reviews.review_id'))
sent_text = db.Column(db.Text, nullable=False)
def __repr__(self):
return "<Sentence sent_id=%s review_id=%s>" % (self.sent_id, self.review_id)
class City(db.Model):
"""
Store city information (used for geolocation of businesses)
"""
__tablename__ = "cities"
city_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
city = db.Column(db.Unicode(64), nullable=False)
state = db.Column(db.String(3), nullable=False)
lat = db.Column(db.Float(Precision=64), nullable=False)
lng = db.Column(db.Float(Precision=64), nullable=False)
def __repr__(self):
return "<City name=%s state=%s>" % (self.city, self.state)
# class NearbyCity(db.Model):
# """ store list of nearby cities"""
# __tablename__ = "nearbycities"
# near_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
# city_id = db.Column(db.Integer, db.ForeignKey('cities.city_id'))
# nearcity_id = db.Column(db.Integer, db.ForeignKey('cities.city_id'))
# def __repr__(self):
# return "<NearbyCity target-city=%s nearby-city=%s>" % (self.city_id, self.nearcity_id)
class CityDistance(db.Model):
""" store distances between cities"""
__tablename__ = "citydistances"
distance_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
city1_id = db.Column(db.Integer, db.ForeignKey('cities.city_id'))
city2_id = db.Column(db.Integer, nullable=False)
miles = db.Column(db.Float(Precision=64), nullable=False) # distance in miles
def __repr__(self):
return "<CityDistance target-city=%s nearby-city=%s>" % (self.city1_id, self.city2_id)
class CityDistCat(db.Model):
"""Same as CityDistance, but only cities with reviews in at least one Plate Pal category"""
__tablename__ = "citycats"
distance_id = db.Column(db.Integer, autoincrement=True, primary_key=True)
city1_id = db.Column(db.Integer, db.ForeignKey('cities.city_id'))
city2_id = db.Column(db.Integer, nullable=False)
miles = db.Column(db.Float(Precision=64), nullable=False) # distance in miles
def __repr__(self):
return "<CityDistCat target-city=%s nearby-city=%s>" % (self.city1_id, self.city2_id)
##############################################################################
# Helper functions
def connect_to_db(app):
"""Connect the database to our Flask app."""
# Configure to use our SQLite database
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///platepal.db'
# app.config['SQLALCHEMY_BINDS'] = {'yelp': 'sqlite:///yelp.db'}
app.config['SQLALCHEMY_ECHO'] = True
db.app = app
db.init_app(app)
# replaced with SQLAlchemy Bind
# def connect_to_yelp_db(app):
# """Connect the Yelp database to our Flask app."""
# # Configure to use our SQLite database
# app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///yelp.db'
# app.config['SQLALCHEMY_ECHO'] = True
# db.app = app
# db.init_app(app)
if __name__ == "__main__":
# As a convenience, if we run this module interactively, it will leave
# you in a state of being able to work with the database directly.
from server import app
connect_to_db(app)
print "Connected to DB."