-
Notifications
You must be signed in to change notification settings - Fork 0
/
reviewdataNB.py
105 lines (91 loc) · 3.8 KB
/
reviewdataNB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json
import sys
import os.path
# container class for review information
class Review:
    """Container for one product review and the feature values derived from it.

    The constructor stores the raw review fields and computes
    ``favorableRating``. Every other feature attribute is only initialised
    to a placeholder here (``0``, or ``"NA"`` for ``helpfulLabel``); this
    class does not compute them itself.
    """

    def __init__(self, reviewerID, productID, upVotes, totalVotes, reviewText, overallRating):
        """Store the raw review fields and initialise all feature slots.

        Args:
            reviewerID: Identifier of the reviewer.
            productID: Identifier of the reviewed product.
            upVotes: Number of favorable votes the review received.
            totalVotes: Total number of votes the review received.
            reviewText: Text body of the review.
            overallRating: Rating attached to the review.
        """
        self.reviewerID = reviewerID
        self.productID = productID
        self.upVotes = upVotes
        self.totalVotes = totalVotes
        self.reviewText = reviewText
        self.overallRating = overallRating

        # metrics
        # Fraction of votes that were favorable. A review without any votes
        # gets 0.0 instead of raising ZeroDivisionError.
        voteCount = float(totalVotes)
        self.favorableRating = float(upVotes) / voteCount if voteCount else 0.0
        self.reviewAge = 0

        # reviewer dict
        self.reviewedAlsoBought = 0
        self.reviewedAlsoViewed = 0
        self.reviewedAlsoBoughtTogether = 0
        self.NumberOfUserReviews = 0
        self.AverageReviewRating = 0

        # product dict
        self.reviewDeviationFromMean = 0
        self.reviewProductMean = 0

        # nlp library
        self.reviewLength = 0
        self.numberStopWords = 0
        self.numberOfPunctuations = 0
        self.averageWordLength = 0
        self.averageSentanceLength = 0
        self.numberExclamationPoints = 0
        self.numberQuestionMarks = 0
        self.readability = 0
        self.namedEntities = 0
        self.numberNouns = 0
        self.numberPassiveVerbs = 0
        self.numberActiveVerbs = 0
        self.numberAdjectives = 0
        self.numberPronous = 0

        self.helpfulLabel = "NA"

    def to_dict(self):
        """Return the vote counts, rating, features and label as a plain dict.

        ``reviewerID``, ``productID`` and ``reviewText`` are not part of the
        returned mapping. Keys are identical to the attribute names.
        """
        return {
            'upVotes': self.upVotes,
            'totalVotes': self.totalVotes,
            'overallRating': self.overallRating,
            'favorableRating': self.favorableRating,
            'reviewAge': self.reviewAge,
            'reviewedAlsoBought': self.reviewedAlsoBought,
            'reviewedAlsoViewed': self.reviewedAlsoViewed,
            'reviewedAlsoBoughtTogether': self.reviewedAlsoBoughtTogether,
            'NumberOfUserReviews': self.NumberOfUserReviews,
            'AverageReviewRating': self.AverageReviewRating,
            'reviewDeviationFromMean': self.reviewDeviationFromMean,
            'reviewProductMean': self.reviewProductMean,
            'reviewLength': self.reviewLength,
            'numberStopWords': self.numberStopWords,
            'numberOfPunctuations': self.numberOfPunctuations,
            'averageWordLength': self.averageWordLength,
            'averageSentanceLength': self.averageSentanceLength,
            'numberExclamationPoints': self.numberExclamationPoints,
            # Must read the question-mark counter, not the exclamation one.
            'numberQuestionMarks': self.numberQuestionMarks,
            'namedEntities': self.namedEntities,
            'numberNouns': self.numberNouns,
            'numberPassiveVerbs': self.numberPassiveVerbs,
            'numberActiveVerbs': self.numberActiveVerbs,
            'numberAdjectives': self.numberAdjectives,
            'numberPronous': self.numberPronous,
            'readability': self.readability,
            'helpfulLabel': self.helpfulLabel,
        }
# Read in the review data when there is no pre-calculated file
def readInReviewData(fileName, minVoteThreshold, minReviewLength):
    """Load reviews from a file holding one JSON object per line.

    From each parsed object the keys 'reviewerID', 'asin', 'helpful'
    (index 0 = up votes, index 1 = total votes), 'reviewText' and 'overall'
    are read. A review is kept only when its total vote count and its text
    length both reach the given minimums.

    Args:
        fileName: Path of the file to read.
        minVoteThreshold: Minimum total votes a review needs to be kept.
        minReviewLength: Minimum length of the review text (in characters)
            a review needs to be kept.

    Returns:
        A list of Review objects, in file order, for the reviews that
        passed both filters.

    Raises:
        KeyError: If a parsed object lacks one of the keys listed above.
        ValueError: If a non-blank line is not valid JSON.
    """
    reviewsList = []
    # The context manager closes the file even if a line fails to parse.
    with open(fileName, 'r') as reviewFile:
        for line in reviewFile:
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would otherwise make json.loads raise.
            if not line.strip():
                continue

            # Get data from json file
            data = json.loads(line)

            # Grab each data component
            reviewerID = data['reviewerID']
            productID = data['asin']
            upVotes = data['helpful'][0]
            totalVotes = data['helpful'][1]
            reviewText = data['reviewText']
            rating = data['overall']

            # Only keep reviews that meet the minimum vote threshold and the
            # minimum text length threshold
            if totalVotes >= minVoteThreshold and len(reviewText) >= minReviewLength:
                reviewsList.append(
                    Review(reviewerID, productID, upVotes, totalVotes, reviewText, rating)
                )
    return reviewsList