Skip to content

Commit

Permalink
Merge pull request #151 from CPSSD/n/update-opinion#122
Browse files Browse the repository at this point in the history
N/update opinion#122
  • Loading branch information
m1cr0man committed Mar 14, 2016
2 parents 635e256 + e6d19a9 commit d119a7c
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 1 deletion.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,8 @@ This is where we keep the server, which routes HTTP and renders responses.
#### `./topics`

This contains the tool to parse an article and pick out the topics it relates to.

#### `./update_opinion`

This contains the tool that is called whenever a user votes on an article. It updates the list of topics they have opinions on in the database, and updates that user's machine learning model with the new data.

1 change: 1 addition & 0 deletions Vagrantfile
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ Vagrant.configure(2) do |config|
sudo apt-get install build-essential python-dev
sudo pip install spacy
sudo python -m spacy.en.download
sudo apt-get install -y python-sklearn
sudo su -c "gem install sass"
cd /vagrant/server
npm install -y
Expand Down
8 changes: 7 additions & 1 deletion doc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@ File Summaries
</td>
<td>This file describes the database that holds the data on each user, e.g. auth data.</td>
</tr>
<tr>
<td>
<code>db/vote.md</code>
</td>
<td>This file describes the database collection that holds logs of each time a user voted on an article.</td>
</tr>
<tr>
<td>
<code>db/feed.md</code>
Expand All @@ -33,6 +39,6 @@ File Summaries
<code>server/logging-spec.md</code>
</td>
<td>This file describes the format that all program console outputs should adhere to.</td>
<tr>
</tr>
</tbody>
</table>
51 changes: 51 additions & 0 deletions update_opinion/updater.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import gearman
import bson
import pickle
from datetime import datetime

from sklearn import linear_model

gearman_client = None

def log(level, message):
Expand All @@ -19,6 +22,49 @@ def update_topic_counts(old_topics, changes, is_positive):
old_topics[change] = diff
return old_topics

def update_model(user_data, article_data, is_positive):
    """Train the user's pickled model on a single vote and store it back.

    The model is an ``SGDClassifier`` trained incrementally on two features:
    the topic crossover score returned by the ``fast_score`` gearman job and
    the age of the article in milliseconds.

    Arguments:
        user_data: dict for the user. ``user_data['words']`` is read, and
            ``user_data['model']`` (a pickled model) is read if present.
        article_data: dict for the article. ``article_data['topics']`` and
            ``article_data['pub_date']`` (a datetime) are read.
        is_positive: truthy if the user voted the article up.

    Returns:
        The same ``user_data`` dict, with ``user_data['model']`` replaced by
        the re-pickled model (left untouched if pickling fails).
    """
    log(0, "Getting pickled model")
    model = None
    if "model" not in user_data:
        log(1, "No model included in user data, creating new one.")
    else:
        try:
            # NOTE(review): pickle.loads can execute arbitrary code; this is
            # only safe as long as the user db is fully trusted.
            model = pickle.loads(user_data["model"])
        except Exception as e:
            log(2, "Error depickling model: " + str(e))
    if model is None:
        # Either no model was stored or the stored one could not be loaded.
        model = linear_model.SGDClassifier(loss='log')

    log(0, "Training model with new data")
    # A comparison of how close the article is to the user's interests in
    # terms of topic, taken from the worker in /aggregator.
    topic_crossover = 0
    log(0, str(user_data['words']))
    log(0, str(article_data['topics']))
    # The article's topics go under 'article_words' and the user's words
    # under 'user_words'; these were previously sent the wrong way round.
    score_request = {
        'article_words': article_data['topics'],
        'user_words': user_data['words'],
    }
    score_job = gearman_client.submit_job(
        'fast_score', str(bson.BSON.encode(score_request)))
    score_data = bson.BSON.decode(bson.BSON(score_job.result))
    if score_data['status'] == 'ok':
        topic_crossover = score_data['score']
    else:
        # Fall back to a crossover of 0; .get() so that a reply without a
        # description cannot raise while we are reporting the error.
        log(2, "Error getting crossover score: " + str(score_data.get('description')))

    # Age of the article in milliseconds.
    # NOTE(review): datetime.now() is naive local time; this assumes
    # pub_date is stored the same way - confirm.
    age = (datetime.now() - article_data['pub_date']).total_seconds() * 1000

    inputs = [topic_crossover, age]
    # Class 0 is a positive vote, class 1 a negative one.
    output = 0 if is_positive else 1
    log(0, str(inputs) + " " + str(output))
    try:
        # classes must be supplied because a fresh model sees one sample.
        model.partial_fit([inputs], [output], classes=[0, 1])
    except Exception as e:
        # Best effort: a failed training step must not lose the vote itself.
        log(2, "Could not train model: " + str(e))

    log(0, "Repickling model")
    try:
        user_data['model'] = pickle.dumps(model)
    except Exception as e:
        log(2, "Error pickling model: " + str(e))
    return user_data



def add_update_to_db(data):
"""log the given user opinion to the vote db collection"""
req_data = {"database":"feedlark", "collection":"vote", "data":data}
Expand Down Expand Up @@ -94,8 +140,13 @@ def update_user_model(worker, job):
user_words = user_data['words']
for item in feed_data['items']:
if item['link'] == job_input['article_url']:
if not 'topics' in item:
log(1, "No topics associated with given article.")
break
topics = item['topics']
user_words = update_topic_counts(user_words, topics, job_input['positive_opinion'])
user_data = update_model(user_data, item, job_input["positive_opinion"]) # update the pickled user model
break

log(0, "Updating user db with new topic weights")
user_data['words'] = user_words
Expand Down

0 comments on commit d119a7c

Please sign in to comment.