diff --git a/.gitattributes b/.gitattributes index ec8c572..8d74334 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ * text eol=lf +*.png -crlf /server/assets/** binary diff --git a/README.md b/README.md index e1099e1..6b670f2 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,10 @@ Feedlark is using [MongoDB](http://mongodb.org) for its data storage. There are The front end uses [express.js](http://expressjs.com/), an MVC framework in Node.js. +Setting up the Vagrant VM for the first time is very slow (potentially upwards of 30 minutes), as there are a lot of dependencies. In particular, `spacy`, the tool we are using for Python natural language processing, requires a 500 MB download of its English NLP model. However, once this initial setup has been completed, the VM can be booted in less than a minute. + +The whole virtual machine currently takes up no more than 6 GB on disk. + Dependencies ------------ @@ -75,7 +79,7 @@ Project Directory Overview #### `./aggregator` -This is the code that coalesces the database collections `feed` and `user`, and places the data in `g2g`. That is, it takes the feed data, and the user data, and creates the feeds tailored to each individual user. +This is the code that coalesces the database collections `feed` and `user`, and places the data in `g2g`. That is, it takes the feed data, and the user data, and creates the feeds tailored to each individual user. It also includes the tool to compute the similarity of a user's interests and an article's topics. #### `./dbtools` @@ -104,3 +108,8 @@ This is where we keep the server, which routes HTTP and renders responses. #### `./topics` This contains the tool to parse an article and pick out the topics it relates to. + +#### `./update_opinion` + +This contains the tool that is called whenever a user votes on an article. It updates the list of topics they have opinions on in the database, and updates that user's machine learning model with the new data. 
+ diff --git a/aggregator/README.md b/aggregator/README.md index 48f214f..3263de1 100644 --- a/aggregator/README.md +++ b/aggregator/README.md @@ -17,12 +17,41 @@ What is this? This is the code that ties the three Feedlark databases together, it coalesces `feed` and `user` and places the data in `g2g`. The machine learning components will be put here eventually to decide the order of the items but for now they are sorted chronologically. +kw_score.py +----------- + This directory also includes `kw_score.py` which provides two functions, `score` and `fast_score` to measure the crossover between the keywords of an article and a user's interests. The functions assign the articles a score between -1 and 1 which can be used to help judge what articles a user may prefer. Higher is better. The `score` function uses word vectors to find which words in the users list match up best with the keywords in the article and generates a score based on those. The `fast_score` function checks for any words common to both the users words and the article's keywords, this is faster but less general. +The `score` and `fast_score` Gearman workers take BSON encoded data in the form: + +```js +{ + 'article_words': {'word1':0.3, 'word2':0.1, 'wordn':0.4}, + 'user_words': {'space':44, 'truckin':12, 'gardening':-10}, +} +``` + +And return + +```js +{ + "status":"ok", + "score":0.7821, +} +``` +or +```js +{ + "status":"error", + "description":"Somethin' bad happened bro, here's where I tell you about it" +} +``` + + How to do tests --------------- diff --git a/aggregator/kw_score.py b/aggregator/kw_score.py index 2afc215..02eb3b4 100644 --- a/aggregator/kw_score.py +++ b/aggregator/kw_score.py @@ -1,5 +1,7 @@ from datetime import datetime from spacy.en import English +from bson import BSON +import gearman #This is outside a function so it runs only once, on import. 
nlp = English() @@ -53,8 +55,9 @@ def score(article_words, user_words): best_word = str(u).strip() article_word = str(a).strip() - log("Best match for '",article_word,"' is '",best_word,"', similarity: ",best_sim) - total += a_words_norm[article_word] * u_words_norm[best_word] * best_sim + if a != '': + log("Best match for '",article_word,"' is '",best_word,"', similarity: ",best_sim) + total += a_words_norm[article_word] * u_words_norm[best_word] * best_sim log("Total: ",total,", total count: ",len(article_words)) return total/len(article_words) @@ -93,3 +96,51 @@ def fast_score(article_words, user_words): return total/float(total_count) else: return 0 + +def score_gm(worker, job): + try: # decode inside the try so a malformed payload yields an error reply instead of raising + word_data = BSON(job.data).decode() + a_words = word_data['article_words'] + u_words = word_data['user_words'] + except: + log("Problem with data provided",level=2) + return str(BSON.encode({"status":"error","description":"Problem with data provided"})) + + try: + a_score = score(a_words,u_words) + except: + log("Problem when scoring, is the data in the right format?",level=2) + return str(BSON.encode({"status":"error","description":"Problem when scoring, is the data in the right format?"})) + + return str(BSON.encode({"status":"ok","score":a_score})) + + +def fast_score_gm(worker, job): + try: # decode inside the try so a malformed payload yields an error reply instead of raising + word_data = BSON(job.data).decode() + a_words = word_data['article_words'] + u_words = word_data['user_words'] + except: + log("Problem with data provided",level=2) + return str(BSON.encode({"status":"error","description":"Problem with data provided"})) + + try: + a_score = fast_score(a_words,u_words) + except: + log("Problem when scoring, is the data in the right format?",level=2) + return str(BSON.encode({"status":"error","description":"Problem when scoring, is the data in the right format?"})) + + return str(BSON.encode({"status":"ok","score":a_score})) + + +if __name__ == '__main__': + log("Starting Gearman worker") + gm_worker = gearman.GearmanWorker(['localhost:4730']) + 
gm_worker.set_client_id('kw-scoring') + + log("Registering tasks") + gm_worker.register_task('fast_score', fast_score_gm) + gm_worker.register_task('score', score_gm) + + gm_worker.work() + diff --git a/doc/README.md b/doc/README.md index 24e682c..1a15a3b 100644 --- a/doc/README.md +++ b/doc/README.md @@ -22,6 +22,12 @@ File Summaries This file describes the database that holds the data on each user, eg. auth data. + + + db/vote.md + + This file describes the database collection that holds logs of each time a user voted on an article. + db/feed.md @@ -33,6 +39,6 @@ File Summaries server/logging-spec.md This file describes the format that all program console outputs should adhere to. - + diff --git a/doc/db/user.md b/doc/db/user.md index 5649d04..4866361 100644 --- a/doc/db/user.md +++ b/doc/db/user.md @@ -10,6 +10,10 @@ Database to create entries in the finished G2G database. The user password will not be stored, instead, a salt and the hashed password and salt will be stored. These are stored in the same string in bcrypt. +The `tokens` field holds the randomly generated API tokens. Currently each token +maps to a `true` value, but this could be extended so that each token has a +validity date and permissions. + Example Document ---------------- @@ -22,6 +26,7 @@ Example Document "password": "$2a$08$ThzPc3zm84JPb6LmvcfCkuXkwyh8H.Mn1VC4EKu9guksI9lbdb7Fa", "subscribed_feeds": ["news.ycombinator.com/rss", "pssd.computing.dcu.ie/rss.xml"], "words": {"butter":2, "milk":13, "antidisestablishmentarianism":-33}, + "tokens": {"add15f620657bb3fd8ce7fa9611f1aaba8717559295706a6d80f9e8cf58e81d7":true} } ``` diff --git a/doc/style.md b/doc/style.md index 0b094bc..f0907ed 100644 --- a/doc/style.md +++ b/doc/style.md @@ -1,26 +1,17 @@ # Style guide. -This page will hold links to the style guides that the feedlark team will aim to uphold for this project. This is done to improve readability and functionality in our code. 
- -### Markdown -The following is a link to the Google markdown style guide which we will use for all markdown code in our project. -https://github.com/google/styleguide/blob/gh-pages/docguide/style.md +This page will hold links to the style guides that the Feedlark team will aim to +uphold for this project. This is done to improve readability and functionality +in our code. ### Python Style guide -The following is a link to the Pocoo style guide which we will use for all Python code in our project. +The following is a link to the Pocoo style guide which we will use for all +Python code in our project. http://www.pocoo.org/internal/styleguide/ ### Go Style guide -The following is a link to the golang style guide which we will use for all Go code in our project. +The following is a link to the `golang` style guide which we will use for all +Go code in our project. https://github.com/golang/go/wiki/CodeReviewComments ### JavaScript Style guide -The following is a link to the airbnb JavaScript style guide which we will use for all JavaScript code in our project -https://github.com/airbnb/javascript/blob/master/README.md - -### Shell Script Style guide -The following is a link to the Google Bash style guide which we will use for all Bash Script code in our project -https://google.github.io/styleguide/shell.xml - -### HTML/CSS Style guide -The following is a link to the Google HTML/CSS style guide which we will use for all HTML/CSS code in our project -https://google.github.io/styleguide/htmlcssguide.xml +All our JavaScript is checked with [JSHint](http://jshint.com/). 
diff --git a/script/aggregator b/script/aggregator index 19025c9..988944a 100755 --- a/script/aggregator +++ b/script/aggregator @@ -3,7 +3,6 @@ set -xe #Dependencies mongo localhost:27017/feedlark script/vagrant/create_feed_user_db.js -python dbtools/getter/getter.py & go run dbtools/start_workers.go & #Unit tests diff --git a/server/.jshintrc b/server/.jshintrc index 2b6f469..59f01b9 100644 --- a/server/.jshintrc +++ b/server/.jshintrc @@ -1,3 +1,4 @@ { - "esversion": 6 + "node": true, + "esnext": true } diff --git a/server/README.md b/server/README.md index 1b038a0..a62921d 100644 --- a/server/README.md +++ b/server/README.md @@ -19,34 +19,55 @@ Feedlark Web Server Requirements & Setup -------------------- -The Feedlark Vagrant box will provide you with all the tools needed to run the web server. If you have not set it up already, see the instructions in the main README.md located in the root of the repository. +The Feedlark Vagrant box will provide you with all the tools needed to run the +web server. If you have not set it up already, see the instructions in the main + README.md located in the root of the repository. Attention Windows Vagrant Users ------------------------------- +Make sure you start your Command Prompt as Admin. + Run `vagrant up` as admin. Not doing that causes problems when `npm install` trys to make symlinks in the directory. Admin is required so vagrant can force VirtualBox to allow symlinks. -Anyway, to do that, make sure you start your Command Prompt as Admin. 
- Usage ----- -First, make sure you are in the server folder inside the Vagrant box with the commands `vagrant ssh` and `cd /vagrant/server` +First, make sure you are in the server folder inside the Vagrant box with the +commands `vagrant ssh` and `cd /vagrant/server` Here's a list of what you can do: | command | description | | ------- | ----------- | -| `npm run start` | starts the server, using `npm bin/www` | +| `npm run start` | starts the server, using `node bin/www` | | `npm run test` | starts the mocha.js tests located in `tests/` | +| `npm run hint` | lint everything | Once started, the server will be available on http://192.168.2.2:3000 -Here's a list of what we _maybe should_ be able to do: +Pagination +---------- -| command | description | -| ------- | ----------- | -| `npm run lint` | lint everything | +Query strings control how much data is loaded per page. + +| query string | arguments | description | +| ------------ | --------- | -------------------- | +| `page` | int | Current page to view | +| `page_length` | int | Amount of links per page | + + +Plaintext +---------- + +To access the plaintext endpoint, first generate an API token in your profile +page. 
+ +To make a plaintext request, generate a key and request the stream like so: + + http://feedlark.com/plaintext?token=$token&username=$username + + where $token is your token and $username is your username diff --git a/server/controllers/feeds.js b/server/controllers/feeds.js index 987b962..ca14b89 100755 --- a/server/controllers/feeds.js +++ b/server/controllers/feeds.js @@ -33,12 +33,13 @@ module.exports = { feedModel.create(url, db => { // Add to current user - userModel.addFeed(db, req.session.username, url, _ => { + userModel.addFeed(db, req.session.username, url, subscribed_feeds => { // Call gearman gearman.startJob('update-single-feed', url, undefined, () => {}); // Return to feed manager page req.session.msg = "Successfully added feed!"; + req.session.subscribed_feeds = subscribed_feeds; return res.redirect(302, "/feeds"); }); }); @@ -55,10 +56,11 @@ module.exports = { } var url = req.query.url.toLowerCase(); - userModel.removeFeed(req.session.username, url, _ => { + userModel.removeFeed(req.session.username, url, subscribed_feeds => { // Return to feed manager page req.session.msg = "Successfully removed feed!"; + req.session.subscribed_feeds = subscribed_feeds; return res.redirect(302, "/feeds"); }); } diff --git a/server/controllers/streams.js b/server/controllers/streams.js index 866b95b..b9a4f07 100755 --- a/server/controllers/streams.js +++ b/server/controllers/streams.js @@ -14,22 +14,72 @@ module.exports = { // Get & verify the page length & number // This Lo-Dash function is lovely - var page_length = _.toSafeInteger(req.params.page_length); - var page = _.toSafeInteger(req.params.page); + var page_length = _.toSafeInteger(req.query.page_length); + if (page_length <= 0) { + page_length = 20; // Default Page Length = 20 + } + var page = _.toSafeInteger(req.query.page); // defaults to 0 if undefined + + // Sort out the filters + var keywords = []; + if (typeof req.query.keywords != "undefined" && req.query.keywords.length > 1) { + keywords = 
req.query.keywords.split(" ").map(val => val.toLowerCase()); + } getFeeds(req.session.username, feeds => { - // Get a page worth of feeds - var pageinated_feeds = []; - for (var i = page * page_length; i < (page + 1) * page_length && i < feeds.length; i++) { - pageinated_feeds.push(feeds[i]); - } + // Filter the feeds + var filtered_feeds = feeds.filter((feed, index, src) => { + + // Match with the filters + return (typeof req.query.source == "undefined" || req.query.source.length < 1 || feed.feed == req.query.source) && + (keywords.length < 1 || keywords.every(val => feed.name.toLowerCase().includes(val))); + }); + + // Clamp the page number to [0, last page]; an array divided by a number is NaN, so use .length + page = Math.max(0, Math.min(page, Math.ceil(filtered_feeds.length / page_length) - 1)); + + // Take a page worth of feeds + var pageinated_feeds = _.slice(filtered_feeds, page*page_length, (page+1)*page_length); + + var next_page = page + 1; + if (next_page * page_length >= filtered_feeds.length) next_page = 0; // 0 = no further page; >= avoids linking to an empty page + + res.status(200).render("stream_index", { + feeds: pageinated_feeds, + page: page, + next_page: next_page, + page_length: page_length, + subscribed_feeds: req.session.subscribed_feeds, + keywords: req.query.keywords || "", + source: req.query.source || "" + }); + }); + }, + + plaintext: (req, res) => { + var page_length = _.toSafeInteger(req.query.page_length); + if (page_length <= 0) { + page_length = 20; // Default Page Length = 20 + } + var page = _.toSafeInteger(req.query.page); + + + var username = req.query.username; + res.type('.txt'); + + getFeeds(username, feeds => { - // Work out the next page number now, because Lo-Dash sorted out the param already + var pageinated_feeds = _.slice(feeds, page*page_length, (page+1)*page_length); var next_page = page + 1; if ((page + 1) * page_length > feeds.length) next_page = 0; - res.status(200).render("stream_index", {feeds: pageinated_feeds, page: next_page, page_length: page_length}); + res.status(200).render("stream_plaintext", { + feeds: pageinated_feeds, + 
page: page, + next_page: next_page, + page_length: page_length + }); }); } }; diff --git a/server/controllers/users.js b/server/controllers/users.js index f4a027f..87f5521 100755 --- a/server/controllers/users.js +++ b/server/controllers/users.js @@ -7,6 +7,7 @@ const bcrypt = require("bcrypt-nodejs"); const userModel = require("../models/user"); +const crypto = require("crypto"); const _ = require("lodash"); module.exports = { @@ -30,6 +31,7 @@ module.exports = { // Set session vars and redirect req.session.username = user.username; + req.session.subscribed_feeds = user.subscribed_feeds; req.session.msg = "Successfully logged in."; return res.redirect(302, "/user"); }); @@ -69,6 +71,79 @@ module.exports = { return res.redirect(302, "/user"); }); }); + }, + + // Render the user profile + profile: (req, res) => { + userModel.findByUsername(req.session.username, user => { + res.status(200).render("profile", { + user: user + }); + }); + }, + + addToken: (req, res) => { + const username = req.session.username; + // TODO: add per token permissions + // Get the user's details from the DB + crypto.randomBytes(32, (err, buf) => { + if (err) return res.status(400).render("/user", {err: "Oops, something broke."}); + const token = buf.toString('hex'); + userModel.addToken(username, token, status => { + + if (typeof status == "undefined" || status == "err") { + return res.redirect(400 , "/user"); + } + else { + return res.redirect(302, "/user"); + } + }); + }); + }, + + removeToken: (req, res) => { + const username = req.session.username; + const token = req.query.token; + // TODO: add per token permissions + userModel.removeToken(username, token, (data) => { + return res.redirect(302, "/user"); + }); + }, + + listTokens: (req, res) => { + const username = req.session.username; + userModel.findByUsername(username, user => { + if ( user ) { + if (user.tokens) { + return res.status(200).send(user.tokens); + } + else { + res.status(403).end(); + } + } + else { + 
res.status(403).end(); + } + }); + }, + + validToken: (req, res, next) => { + const username = req.query.username; + const token = req.query.token; + userModel.findByUsername(username, user => { + if ( user ) { + if (typeof token == "string" && user.tokens && Object.prototype.hasOwnProperty.call(user.tokens, token) && user.tokens[token]) { // own-property check: users without tokens must not crash, inherited keys like "constructor" must not authenticate + next(); + } + else { + res.status(403).end(); + } + } + else { + res.status(403).end(); + } + }); } + }; diff --git a/server/middleware/db.js b/server/middleware/db.js index 29dfa65..459b81b 100755 --- a/server/middleware/db.js +++ b/server/middleware/db.js @@ -37,10 +37,9 @@ module.exports = { update: (db, collection, selector, data, cb) => { db.collection(collection).updateOne(selector, {$set: data}, (err, data) => { - if (err) throw err; - return cb(); + return cb(data); }); }, diff --git a/server/middleware/routes.js b/server/middleware/routes.js index 416fc9a..ec77f09 100755 --- a/server/middleware/routes.js +++ b/server/middleware/routes.js @@ -3,10 +3,12 @@ // like express and the auth middleware aren't imported N times const router = require("express").Router(); + const userController = require("../controllers/users"); const feedController = require("../controllers/feeds"); const streamController = require("../controllers/streams"); + // Checks if the client is authenticated function isAuthed(req, res, next) { if (req.session.username) { @@ -40,7 +42,7 @@ router.get("/user/login", (req, res) => { }); // Profile -router.get("/user", isAuthed, (req, res) => { res.render("profile"); }); +router.get("/user", isAuthed, userController.profile); // Feeds pages @@ -57,7 +59,15 @@ router.get("/feeds/remove", isAuthed, feedController.remove); router.get("/feeds", isAuthed, feedController.index); // Stream -router.get("/stream/:page_length/:page", isAuthed, streamController.index); +router.get("/stream", isAuthed, streamController.index); + +// Tokens (for API stuff!) 
+router.get("/token/add", isAuthed, userController.addToken); +router.get("/token/remove", isAuthed, userController.removeToken); +router.get("/token/list", isAuthed, userController.listTokens); + +// Plaintext Endpoint +router.get("/plaintext", userController.validToken, streamController.plaintext); // Home/Index router.get("/", (req, res) => { diff --git a/server/models/user.js b/server/models/user.js index acf2eb3..dbf531e 100755 --- a/server/models/user.js +++ b/server/models/user.js @@ -64,7 +64,7 @@ module.exports = { "user", {username: username}, {subscribed_feeds: user.subscribed_feeds}, - cb + _ => cb(user.subscribed_feeds) ); }); }, @@ -84,8 +84,48 @@ module.exports = { "user", {username: username}, {subscribed_feeds: user.subscribed_feeds}, + _ => cb(user.subscribed_feeds) + ); + })); + }, + + addToken: (username, token, cb) => { + if (!token || !username) { + return cb(); + } + dbFuncs.transaction(db => dbFuncs.findOne(db, "user", {username: username}, user => { + + // TODO Add and render error message + if (!user.tokens) { + user.tokens = {}; + } + user.tokens[token] = true; + + + // TODO: check for max tokens + dbFuncs.update( + db, + "user", + {username: username}, + {tokens: user.tokens}, cb ); })); - } + }, + + removeToken: (username, token, cb) => { + dbFuncs.transaction(db => dbFuncs.findOne(db, "user", {username: username}, user => { + if (typeof user.tokens != "undefined") { + delete user.tokens[token]; + } + dbFuncs.update( + db, + "user", + {username: username}, + {tokens: user.tokens}, + cb + ); + })); + }, + }; diff --git a/server/package.json b/server/package.json index 53558fc..e2a84ef 100644 --- a/server/package.json +++ b/server/package.json @@ -4,7 +4,7 @@ "private": true, "scripts": { "start": "node ./bin/www", - "test": "NODE_ENV=test node ./node_modules/mocha/bin/mocha test/bootstrap.test.js test/integration/**/User.test.js test/integration/**/Feed.test.js", + "test": "NODE_ENV=test node ./node_modules/mocha/bin/mocha 
test/bootstrap.test.js test/integration/**/User.test.js test/integration/**/Feed.test.js test/integration/**/Stream.test.js", "hint": "jshint controllers/*.js middleware/*.js models/*.js test/*.js || true " }, "dependencies": { diff --git a/server/test/integration/Stream.test.js b/server/test/integration/Stream.test.js new file mode 100644 index 0000000..e21975e --- /dev/null +++ b/server/test/integration/Stream.test.js @@ -0,0 +1,65 @@ +const app = require("../../app"); +const assert = require('assert'); +const request = require('supertest'); +const lodash = require('lodash'); + +describe('StreamController', _ => { + var agent = request.agent(app); + + var user_details_base = { + username: "rmss", + email: "rms@gnu.org", + password: "gnuisnotlinux" + } + + describe('#index()', _ => { + it("Stream is unavailable to users logged out", done => { + agent + .get('/stream') + .expect(403, done); + }); + it('User can login', done => { + agent + .post('/user/login') + .type('form') + .send(user_details_base) + .expect(302, done); + }); + it("Stream is available to users", done => { + agent + .get('/stream') + .expect(200, done); + }); + }); + + describe('#plaintext()', _ => { + var api_key = null; + it("Able to generate api token", done => { + agent + .get('/token/add') + .expect(302, done); + }); + + it("Able to fetch api token", done => { + agent + .get('/token/list') + .expect(200) + .end( (err, res) => { + if (err) throw err; + api_key = Object.keys(res.body)[0]; + if ( lodash.isString(api_key) ) { + done(); + } + else { + done("Key doesn't exist"); + } + }); + }); + + it("Able to get plaintext", done => { + agent + .get('/plaintext?username=' + user_details_base.username + "&token=" + api_key) + .expect(200, done); + }); + }); +}); diff --git a/server/views/partials/footer.ejs b/server/views/partials/footer.ejs index 2cced08..33c104a 100644 --- a/server/views/partials/footer.ejs +++ b/server/views/partials/footer.ejs @@ -4,23 +4,19 @@ - + - + <% if (typeof session != 
"undefined" && typeof session.msg != "undefined") { %> diff --git a/server/views/partials/header.ejs b/server/views/partials/header.ejs index 41dc0fb..d4d72b8 100644 --- a/server/views/partials/header.ejs +++ b/server/views/partials/header.ejs @@ -36,7 +36,7 @@