Merge branch 'master' of github.com:CPSSD/feedlark

CPSSD · Mar 14, 2016 · 7b6e8a9 · 7b6e8a9
2 parents 31e84e4 + d119a7c
commit 7b6e8a9
Show file tree

Hide file tree

Showing 24 changed files with 561 additions and 76 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -1,3 +1,4 @@
 * text eol=lf
+*.png -crlf
 /server/assets/** binary
 
diff --git a/README.md b/README.md
@@ -11,6 +11,10 @@ Feedlark is using [MongoDB](http://mongodb.org) for its data storage. There are
 
 The front end uses [express.js](http://expressjs.com/), an MVC framework in Node.js.
 
+Setting up the Vagrant VM for the first time is very slow (potentially upwards of 30 minutes), as there are a lot of dependencies. In particular, `spacy`, the tool we are using for Python natural language processing, requires a 500 MB download of its English NLP model. However, once this initial setup has been completed, the VM can be booted in less than a minute.
+
+The whole virtual machine will currently be no larger than 6 GB on disk.
+
 
 Dependencies
 ------------
@@ -75,7 +79,7 @@ Project Directory Overview
 
 #### `./aggregator`
 
-This is the code that coalesces the database collections `feed` and `user`, and places the data in `g2g`. That is, it takes the feed data, and the user data, and creates the feeds tailored to each individual user.
+This is the code that coalesces the database collections `feed` and `user`, and places the data in `g2g`. That is, it takes the feed data, and the user data, and creates the feeds tailored to each individual user. It also includes the tool to compute the similarity of a user's interests and an article's topics.
 
 #### `./dbtools`
 
@@ -104,3 +108,8 @@ This is where we keep the server, which routes HTTP and renders responses.
 #### `./topics`
 
 This contains the tool to parse an article and pick out the topics it relates to.
+
+#### `./update_opinion`
+
+This contains the tool that is called whenever a user votes on an article. It updates the list of topics they have opinions on in the database, and updates that user's machine learning model with the new data.
+
diff --git a/aggregator/README.md b/aggregator/README.md
@@ -17,12 +17,41 @@ What is this?
 This is the code that ties the three Feedlark databases together, it coalesces `feed` and `user` and places the data in `g2g`.
 The machine learning components will be put here eventually to decide the order of the items but for now they are sorted chronologically.
 
+kw_score.py
+-----------
+
 This directory also includes `kw_score.py` which provides two functions, `score` and `fast_score` to measure the crossover between the keywords of an article and a user's interests.
 The functions assign the articles a score between -1 and 1 which can be used to help judge what articles a user may prefer. Higher is better.
 
 The `score` function uses word vectors to find which words in the users list match up best with the keywords in the article and generates a score based on those.
 The `fast_score` function checks for any words common to both the users words and the article's keywords, this is faster but less general.
 
+The `score` and `fast_score` Gearman workers take BSON-encoded data in the form:
+
+```js
+{
+	'article_words': {'word1':0.3, 'word2':0.1, 'wordn':0.4},
+	'user_words': {'space':44, 'truckin':12, 'gardening':-10},
+}
+```
+
+And return
+
+```js
+{
+	"status":"ok",
+	"score":0.7821,
+}
+```
+or
+```js
+{
+	"status":"error",
+	"description":"Somethin' bad happened bro, here's where I tell you about it"
+}
+```
+
+
 How to do tests
 ---------------
 

diff --git a/aggregator/kw_score.py b/aggregator/kw_score.py
@@ -1,5 +1,7 @@
 from datetime import datetime
 from spacy.en import English
+from bson import BSON
+import gearman
 
 #This is outside a function so it runs only once, on import.
 nlp = English()
@@ -53,8 +55,9 @@ def score(article_words, user_words):
 				best_word = str(u).strip()
 
 		article_word = str(a).strip()
-		log("Best match for '",article_word,"' is '",best_word,"', similarity: ",best_sim)
-		total += a_words_norm[article_word] * u_words_norm[best_word] * best_sim
+		if a != '':
+			log("Best match for '",article_word,"' is '",best_word,"', similarity: ",best_sim)
+			total += a_words_norm[article_word] * u_words_norm[best_word] * best_sim
 
 	log("Total: ",total,", total count: ",len(article_words))
 	return total/len(article_words)
@@ -93,3 +96,51 @@ def fast_score(article_words, user_words):
         return total/float(total_count)
     else:
         return 0
+
+def score_gm(worker, job):
+	word_data = BSON(job.data).decode()
+	try:
+		a_words = word_data['article_words']
+		u_words = word_data['user_words']
+	except:
+		log("Problem with data provided",level=2)
+		return str(BSON.encode({"status":"error","description":"Problem with data provided"}))
+
+	try:
+		a_score = score(a_words,u_words)
+	except:
+		log("Problem when scoring, is the data in the right format?")
+		return str(BSON.encode({"status":"error","description":"Problem when scoring, is the data in the right format?"}))
+
+	return str(BSON.encode({"status":"ok","score":a_score}))
+
+
+def fast_score_gm(worker, job):
+	word_data = BSON(job.data).decode()
+	try:
+		a_words = word_data['article_words']
+		u_words = word_data['user_words']
+	except:
+		log("Problem with data provided",level=2)
+		return str(BSON.encode({"status":"error","description":"Problem with data provided"}))
+
+	try:
+		a_score = fast_score(a_words,u_words)
+	except:
+		log("Problem when scoring, is the data in the right format?")
+		return str(BSON.encode({"status":"error","description":"Problem when scoring, is the data in the right format?"}))
+
+	return str(BSON.encode({"status":"ok","score":a_score}))
+
+
+if __name__ == '__main__':
+	log("Starting Gearman worker")
+	gm_worker = gearman.GearmanWorker(['localhost:4730'])
+	gm_worker.set_client_id('kw-scoring')
+
+	log("Registering tasks")
+	gm_worker.register_task('fast_score', fast_score_gm)
+	gm_worker.register_task('score', score_gm)
+
+	gm_worker.work()
+
diff --git a/doc/README.md b/doc/README.md
@@ -22,6 +22,12 @@ File Summaries
       </td>
       <td>This file describes the database that holds the data on each user, eg. auth data.</td>
     </tr>
+    <tr>
+      <td>
+        <code>db/vote.md</code>
+      </td>
+      <td>This file describes the database collection that holds logs of each time a user voted on an article.</td>
+    </tr>
     <tr>
       <td>
         <code>db/feed.md</code>
@@ -33,6 +39,6 @@ File Summaries
         <code>server/logging-spec.md</code>
       </td>
       <td>This file describes the format that all program console outputs should adhere to.</td>
-    <tr>
+    </tr>
   </tbody>
 </table>
diff --git a/doc/db/user.md b/doc/db/user.md
@@ -10,6 +10,10 @@ Database to create entries in the finished G2G database.
 The user password will not be stored, instead, a salt and the hashed password
 and salt will be stored. These are stored in the same string in bcrypt.
 
+`tokens` holds the randomly generated API tokens. Currently each token maps to
+`true`, but this could be changed so that each token carries a validation date
+and permissions.
+
 Example Document
 ----------------
 
@@ -22,6 +26,7 @@ Example Document
 	"password": "$2a$08$ThzPc3zm84JPb6LmvcfCkuXkwyh8H.Mn1VC4EKu9guksI9lbdb7Fa",
 	"subscribed_feeds": ["news.ycombinator.com/rss", "pssd.computing.dcu.ie/rss.xml"],
 	"words": {"butter":2, "milk":13, "antidisestablishmentarianism":-33},
+	"tokens": {"add15f620657bb3fd8ce7fa9611f1aaba8717559295706a6d80f9e8cf58e81d7":true}
 }
 ```
 

diff --git a/doc/style.md b/doc/style.md
@@ -1,26 +1,17 @@
 # Style guide.
-This page will hold links to the style guides that the feedlark team will aim to uphold for this project. This is done to improve readability and functionality in our code.
-
-### Markdown
-The following is a link to the Google markdown style guide which we will use for all markdown code in our project.
-https://github.com/google/styleguide/blob/gh-pages/docguide/style.md
+This page will hold links to the style guides that the feedlark team will aim to
+uphold for this project. This is done to improve readability and functionality
+in our code.
 
 ### Python Style guide
-The following is a link to the Pocoo style guide which we will use for all Python code in our project.
+The following is a link to the Pocoo style guide which we will use for all
+Python code in our project.
 http://www.pocoo.org/internal/styleguide/
 
 ### Go Style guide
-The following is a link to the golang style guide which we will use for all Go code in our project.
+The following is a link to the `golang` style guide which we will use for all
+Go code in our project.
 https://github.com/golang/go/wiki/CodeReviewComments
 
 ### JavaScript Style guide
-The following is a link to the airbnb JavaScript style guide which we will use for all JavaScript code in our project
-https://github.com/airbnb/javascript/blob/master/README.md
-
-### Shell Script Style guide
-The following is a link to the Google Bash style guide which we will use for all Bash Script code in our project
-https://google.github.io/styleguide/shell.xml
-
-### HTML/CSS Style guide
-The following is a link to the Google HTML/CSS style guide which we will use for all HTML/CSS code in our project
-https://google.github.io/styleguide/htmlcssguide.xml
+All our JavaScript is run against [jshint](http://jshint.com/).
diff --git a/script/aggregator b/script/aggregator
@@ -3,7 +3,6 @@ set -xe
 
 #Dependencies
 mongo localhost:27017/feedlark script/vagrant/create_feed_user_db.js
-python dbtools/getter/getter.py &
 go run dbtools/start_workers.go &
 
 #Unit tests

diff --git a/server/.jshintrc b/server/.jshintrc
@@ -1,3 +1,4 @@
 {
-  "esversion": 6
+  "node": true,
+  "esnext": true
 }
diff --git a/server/README.md b/server/README.md
@@ -19,34 +19,55 @@ Feedlark Web Server
 Requirements & Setup
 --------------------
 
-The Feedlark Vagrant box will provide you with all the tools needed to run the web server. If you have not set it up already, see the instructions in the main README.md located in the root of the repository.
+The Feedlark Vagrant box will provide you with all the tools needed to run the
+web server. If you have not set it up already, see the instructions in the main
+README.md located in the root of the repository.
 
 Attention Windows Vagrant Users
 -------------------------------
 
+Make sure you start your Command Prompt as Admin.
+
 Run `vagrant up` as admin. Not doing that causes problems when `npm install`
 trys to make symlinks in the directory. Admin is required so vagrant can
 force VirtualBox to allow symlinks.
 
-Anyway, to do that, make sure you start your Command Prompt as Admin.
-
 Usage
 -----
 
-First, make sure you are in the server folder inside the Vagrant box with the commands `vagrant ssh` and `cd /vagrant/server`
+First, make sure you are in the server folder inside the Vagrant box with the
+commands `vagrant ssh` and `cd /vagrant/server`
 
 
 Here's a list of what you can do:
 
 | command         | description                                               |
 | -------         | -----------                                               |
-| `npm run start` | starts the server, using `npm bin/www`                    |
+| `npm run start` | starts the server, using `node bin/www`                   |
 | `npm run test`  | starts the mocha.js tests located in `tests/`             |
+| `npm run hint`  | lint everything                                           |
 
 Once started, the server will be available on http://192.168.2.2:3000
 
-Here's a list of what we _maybe should_ be able to do:
+Pagination
+----------
 
-| command         | description                                               |
-| -------         | -----------                                               |
-| `npm run lint`  | lint everything                                           |
+Query strings control how much data is loaded per page.
+
+| query string   | arguments | description                |
+| ------------   | --------- | --------------------       |
+|  `page`        | int       | Current page to view       |
+|  `page_length` | int       | Number of links per page   |
+
+
+Plaintext
+----------
+
+To access the plaintext endpoint, first generate an API token in your profile
+page.
+
+To request the plaintext stream, pass your token and username like so:
+
+    http://feedlark.com/plaintext?token=$token&username=$username
+
+where `$token` is your API token and `$username` is your username.
diff --git a/server/controllers/feeds.js b/server/controllers/feeds.js
@@ -33,12 +33,13 @@ module.exports = {
     feedModel.create(url, db => {
 
       // Add to current user
-      userModel.addFeed(db, req.session.username, url, _ => {
+      userModel.addFeed(db, req.session.username, url, subscribed_feeds => {
         // Call gearman
         gearman.startJob('update-single-feed', url, undefined, () => {});
 
         // Return to feed manager page
         req.session.msg = "Successfully added feed!";
+        req.session.subscribed_feeds = subscribed_feeds;
         return res.redirect(302, "/feeds");
       });
     });
@@ -55,10 +56,11 @@ module.exports = {
     }
     var url = req.query.url.toLowerCase();
 
-    userModel.removeFeed(req.session.username, url, _ => {
+    userModel.removeFeed(req.session.username, url, subscribed_feeds => {
 
       // Return to feed manager page
       req.session.msg = "Successfully removed feed!";
+      req.session.subscribed_feeds = subscribed_feeds;
       return res.redirect(302, "/feeds");
     });
   }