Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Background task #2

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
__pycache__
.env
.vscode
.idea
.idea
*.pyc
14 changes: 14 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
flask-inputs = "*"
celery = "*"
redis = "*"

[dev-packages]

[requires]
python_version = "3.6"
132 changes: 132 additions & 0 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Procfile
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
web: gunicorn run
web: gunicorn run
worker: celery worker -A tasks -l INFO
15 changes: 15 additions & 0 deletions app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import os
from flask import Flask

from app.celery import make_celery


# Application-wide Flask instance; app.routes registers its views on it.
flask_app = Flask(__name__)

# Broker / result-backend locations come from the environment, falling back
# to a Redis server on localhost.
flask_app.config.update(
    CELERY_BROKER_URL=os.environ.get('CELERY_BROKER_URL', 'redis://localhost:6379'),
    CELERY_RESULT_BACKEND=os.environ.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379'),
)

# Create the Celery application BEFORE importing the routes. app.routes
# imports the top-level `tasks` module while this package is still being
# initialised; until the assignment below runs, the name `app.celery` refers
# to the app/celery.py submodule rather than to a Celery instance.
# NOTE(review): `tasks` is not visible here -- presumably it needs this
# instance (e.g. `from app import celery`); confirm against tasks.py.
celery = make_celery(flask_app)

# Imported last on purpose: app.routes does `from app import flask_app`, so
# that name must already exist. The import is needed only for its side effect
# of registering the view functions.
from app import routes  # noqa: E402,F401
11 changes: 11 additions & 0 deletions app/celery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from celery import Celery


def make_celery(app):
    """Build a Celery application configured from a Flask application.

    The Celery app is named after ``app.import_name`` and uses the broker
    stored under ``app.config['CELERY_BROKER_URL']``. The whole Flask config
    mapping is then copied into the Celery configuration.

    :param app: Flask application whose ``config`` holds the Celery settings.
    :return: the configured ``Celery`` instance.
    """
    # Looked up by key: the broker URL has to be present in the config.
    broker_url = app.config['CELERY_BROKER_URL']

    worker_app = Celery(app.import_name, broker=broker_url)
    worker_app.conf.update(app.config)
    return worker_app
7 changes: 4 additions & 3 deletions extractor/extractor.py → app/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@
from newspaper.configuration import Configuration
from newspaper.cleaners import DocumentCleaner

from logger import Logger

def get_data_from_html(html):

def get_data_from_html(article_url, html):
result = {}
parsed_html = Parser.fromstring(html)

Expand All @@ -18,16 +20,15 @@ def get_data_from_html(html):
cleaner = DocumentCleaner(config)

result['title'] = extractor.get_title(parsed_html)

publishing_date = extractor.get_publishing_date('', parsed_html)
if publishing_date is None:
publishing_date = datetime.datetime.now()

result['published_at'] = publishing_date.isoformat()

cleaned_html = cleaner.clean(parsed_html)
top_node = extractor.calculate_best_node(cleaned_html)
top_node = extractor.post_cleanup(top_node)
result['content'], _ = formatter.get_formatted(top_node)
result['meta_img'] = extractor.get_meta_img_url(article_url, parsed_html)

return result
3 changes: 2 additions & 1 deletion extractor/logger.py → app/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

logging.basicConfig(filename='info.log', format='%(levelname)s:%(message)s', level=logging.INFO)

class Logger():

class Logger:
@staticmethod
def info(msg):
print("INFO: %s" % msg)
Expand Down
29 changes: 29 additions & 0 deletions app/routes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from flask import request
from app import flask_app, extractor, utils
from app.logger import Logger
from tasks import extract_content

@flask_app.route("/extract", methods=["POST"])
def extract():
    """Validate an extraction request and queue it for background processing.

    The request body must be a JSON object with non-empty ``token``, ``url``
    and ``pageHtml`` fields. On success the three values are handed to the
    ``extract_content`` task via ``.delay()`` and the view returns right away.

    :return: ``(message, status)`` tuple -- status 400 when the payload is not
        JSON or a required field is missing/empty, 200 once the task has been
        queued.
    """
    if not request.is_json:
        return 'Invalid payload format. Should be application/json', 400

    request_payload = request.get_json()
    # A syntactically valid JSON body may still be a list, string, number or
    # null. Treat anything that is not an object as "no fields supplied" so
    # the client gets a 400 below instead of an unhandled TypeError.
    if not isinstance(request_payload, dict):
        request_payload = {}

    # .get() instead of indexing: an absent key must produce the same 400 as
    # an empty value, not a KeyError that surfaces as a 500.
    client_token = request_payload.get('token')
    if not client_token:
        Logger.error("Invalid [token] parameter")
        return 'Invalid [token] parameter', 400

    article_url = request_payload.get('url')
    if not article_url:
        Logger.error("Invalid [article_url] parameter")
        return 'Invalid [article_url] parameter', 400

    article_html = request_payload.get('pageHtml')
    if not article_html:
        Logger.error("Invalid [article_html] parameter")
        return 'Invalid [article_html] parameter', 400

    Logger.info('URL received for extraction:[%s]' % article_url)

    # Hand the work to the Celery task; the HTTP response does not wait for it.
    extract_content.delay(article_url, article_html, client_token)

    return 'Article content was sent to processing', 200
12 changes: 6 additions & 6 deletions extractor/api.py → app/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,20 @@

API_URL = 'https://svandis-api-prod.herokuapp.com'

def upload_post_data(api_token, post_data):
def upload_article_data(client_token, article_data):
url = API_URL + '/api/post'
headers = _get_request_headers(api_token)
headers = _get_request_headers(client_token)

r = requests.post(url, headers=headers, data=json.dumps({'post': post_data}))
r = requests.post(url, headers=headers, data=json.dumps({'post': article_data}))
r.raise_for_status()
return r.status_code


def invalidate_url(api_token, url):
def invalidate_url(client_token, article_url):
url = API_URL + '/api/post/invalidate-url'
headers = _get_request_headers(api_token)
headers = _get_request_headers(client_token)

r = requests.post(url, headers=headers, data=json.dumps({'url': url}))
r = requests.post(url, headers=headers, data=json.dumps({'url': article_url}))
r.raise_for_status()
return r.status_code

Expand Down
Binary file added dump.rdb
Binary file not shown.
5 changes: 0 additions & 5 deletions extractor/__init__.py

This file was deleted.

48 changes: 0 additions & 48 deletions extractor/routes.py

This file was deleted.

8 changes: 0 additions & 8 deletions info.log
Original file line number Diff line number Diff line change
@@ -1,8 +0,0 @@
INFO: * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
INFO: * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
INFO:127.0.0.1 - - [27/Jun/2018 17:03:14] "GET / HTTP/1.1" 404 -
INFO:127.0.0.1 - - [27/Jun/2018 17:03:15] "GET / HTTP/1.1" 404 -
INFO:127.0.0.1 - - [27/Jun/2018 17:03:18] "GET /extract HTTP/1.1" 400 -
INFO: * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
INFO:127.0.0.1 - - [27/Jun/2018 17:03:50] "GET /extract HTTP/1.1" 405 -
INFO:127.0.0.1 - - [27/Jun/2018 17:03:51] "GET /extract HTTP/1.1" 405 -
Loading