Merge pull request #151 from CPSSD/n/update-opinion#122

N/update opinion#122
CPSSD · Mar 14, 2016 · d119a7c · d119a7c
2 parents 635e256 + e6d19a9
commit d119a7c
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -108,3 +108,8 @@ This is where we keep the server, which routes HTTP and renders responses.
 #### `./topics`
 
 This contains the tool to parse an article and pick out the topics it relates to.
+
+#### `./update_opinion`
+
+This contains the tool that is called whenever a user votes on an article. It updates the list of topics they have opinions on in the database, and updates that user's machine learning model with the new data.
+
diff --git a/Vagrantfile b/Vagrantfile
@@ -89,6 +89,7 @@ Vagrant.configure(2) do |config|
     sudo apt-get install build-essential python-dev
     sudo pip install spacy
     sudo python -m spacy.en.download
+    sudo apt-get install -y python-sklearn
     sudo su -c "gem install sass"
     cd /vagrant/server
     npm install -y

diff --git a/doc/README.md b/doc/README.md
@@ -22,6 +22,12 @@ File Summaries
       </td>
      <td>This file describes the database that holds the data on each user, e.g. auth data.</td>
     </tr>
+    <tr>
+      <td>
+        <code>db/vote.md</code>
+      </td>
+      <td>This file describes the database collection that holds logs of each time a user voted on an article.</td>
+    </tr>
     <tr>
       <td>
         <code>db/feed.md</code>
@@ -33,6 +39,6 @@ File Summaries
         <code>server/logging-spec.md</code>
       </td>
       <td>This file describes the format that all program console outputs should adhere to.</td>
-    <tr>
+    </tr>
   </tbody>
 </table>
diff --git a/update_opinion/updater.py b/update_opinion/updater.py
@@ -1,7 +1,10 @@
 import gearman
 import bson
+import pickle
 from datetime import datetime
 
+from sklearn import linear_model
+
 gearman_client = None
 
 def log(level, message):
@@ -19,6 +22,49 @@ def update_topic_counts(old_topics, changes, is_positive):
             old_topics[change] = diff
     return old_topics
 
+def update_model(user_data, article_data, is_positive):
+    """Train the user's pickled classifier on a single vote and store it back.
+
+    The model pickled under user_data['model'] is loaded (a fresh
+    SGDClassifier is created when it is absent or cannot be unpickled),
+    trained on one sample built from the article, and re-pickled into
+    user_data['model'].
+
+    Arguments:
+    user_data -- the user's data; 'words' and 'model' are read, 'model' is written
+    article_data -- the voted-on feed item; 'topics' and 'pub_date' are read
+    is_positive -- truthy if the user's vote on the article was positive
+
+    Returns user_data, which is also modified in place. Failures while
+    unpickling, training or pickling are logged rather than raised.
+    """
+    model = None
+    log(0, "Getting pickled model")
+    if "model" in user_data:
+        try:
+            # NOTE(review): pickle.loads can execute arbitrary code, so this is
+            # only safe while the stored user data is fully trusted.
+            model = pickle.loads(user_data["model"])
+        except Exception as e:
+            log(2, "Error depickling model: " + str(e))
+    else:
+        log(1, "No model included in user data, creating new one.")
+    if model is None:
+        # single fallback for both "no model stored" and "stored model unusable"
+        model = linear_model.SGDClassifier(loss='log')
+
+    log(0, "Training model with new data")
+    log(0, str(user_data['words']))
+    log(0, str(article_data['topics']))
+
+    # a comparison of how close the articles are in terms of topic, taken from the worker in /aggregator
+    topic_crossover = 0
+    # the article's topics go under 'article_words' and the user's words under
+    # 'user_words'; the two values were previously passed the wrong way round
+    score_request = {'article_words': article_data['topics'], 'user_words': user_data['words']}
+    score_job = gearman_client.submit_job('fast_score', str(bson.BSON.encode(score_request)))
+    score_data = bson.BSON.decode(bson.BSON(score_job.result))
+    if score_data['status'] == 'ok':
+        topic_crossover = score_data['score']
+    else:
+        # .get so an error reply without a description cannot raise KeyError here
+        log(2, "Error getting crossover score: " + str(score_data.get('description')))
+    age = (datetime.now() - article_data['pub_date']).total_seconds()*1000 # get the number of millis in difference
+
+    inputs = [topic_crossover, age]
+    output = 0 if is_positive else 1 # class 0 is a positive vote, class 1 a negative one
+    log(0, str(inputs) + " " + str(output))
+    try:
+        # classes is passed on every call so that a brand new model knows both labels
+        model.partial_fit([inputs], [output], classes=[0, 1])
+    except Exception as e:
+        log(2, "Could not train model: " + str(e))
+
+    log(0, "Repickling model")
+    try:
+        user_data['model'] = pickle.dumps(model)
+    except Exception as e:
+        log(2, "Error pickling model: " + str(e))
+    return user_data
+
+
+
 def add_update_to_db(data):
     """log the given user opinion to the vote db collection"""
     req_data = {"database":"feedlark", "collection":"vote", "data":data}
@@ -94,8 +140,13 @@ def update_user_model(worker, job):
     user_words = user_data['words']
     for item in feed_data['items']:
         if item['link'] == job_input['article_url']:
+            if not 'topics' in item:
+                log(1, "No topics associated with given article.")
+                break
             topics = item['topics']
             user_words = update_topic_counts(user_words, topics, job_input['positive_opinion'])
+            user_data = update_model(user_data, item, job_input["positive_opinion"]) # update the pickled user model
+            break
 
     log(0, "Updating user db with new topic weights")
     user_data['words'] = user_words