-
Notifications
You must be signed in to change notification settings - Fork 13
Attach document image to tweet #29
base: master
Are you sure you want to change the base?
Changes from 13 commits
d8c8e46
d082972
5ad5a34
1b054ae
7822531
768c1bc
653275f
3bccd45
eb0d8cd
e5a53dc
f0b16cb
1f5fd5f
38c2b5d
32be9f4
5f0cae7
b0dc505
c841e32
e88e8f2
fa7a805
622268a
60d87be
23f997b
2be5a44
0c065d8
3c4d8e0
1a5d446
0fccf2e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,10 @@ | ||
amqp==2.2.2 | ||
celery==4.1.0 | ||
ipdb==0.10.3 | ||
opencv-python==3.4.0.12 | ||
pandas==0.21.0 | ||
pymongo==3.5.1 | ||
python-twitter==3.3 | ||
requests==2.18.4 | ||
serenata-toolbox==12.2.2 | ||
wand==0.4.4 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
import datetime | ||
from unittest import TestCase, mock | ||
from io import BytesIO, BufferedReader | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As a matter of style, would you mind putting sorting this? I mean, |
||
|
||
import pandas as pd | ||
from twitter import TwitterError | ||
|
@@ -108,6 +109,8 @@ def setUp(self): | |
self.reimbursement = { | ||
'congressperson_name': 'Eduardo Cunha', | ||
'document_id': 10, | ||
'applicant_id': 10, | ||
'year': 2015, | ||
'state': 'RJ', | ||
'twitter_profile': 'DepEduardoCunha', | ||
} | ||
|
@@ -117,19 +120,48 @@ def setUp(self): | |
|
||
def test_publish(self): | ||
self.subject.publish() | ||
self.api.PostUpdate.assert_called_once_with(self.subject.text()) | ||
text, reimbursement_image = self.subject.tweet_data() | ||
self.api.PostUpdate.assert_called_once_with( | ||
media=reimbursement_image, status=text) | ||
dict_representation = dict(self.subject) | ||
self.database.posts.insert_one.assert_called_once_with( | ||
dict_representation) | ||
|
||
def test_text(self): | ||
def test_tweet_data(self): | ||
message = ( | ||
'🚨Gasto suspeito de Dep. @DepEduardoCunha (RJ). ' | ||
'Você pode me ajudar a verificar? ' | ||
'https://jarbas.serenata.ai/layers/#/documentId/10 ' | ||
'#SerenataDeAmor na @CamaraDeputados' | ||
) | ||
self.assertEqual(message, self.subject.text()) | ||
reimbursement_image = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As far as I could see you defined There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a mistake, I'll remove it. I had some doubts about how to test the methods that call the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd mock the @patch.object(Post, 'tweet_image')
def test_something_else(self, tweet_image_mock):
# do magic
tweet_image_mock.assert_called_once_with() |
||
self.assertEqual( | ||
(message, None), self.subject.tweet_data()) | ||
self.reimbursement['twitter_profile'] = None | ||
with self.assertRaises(ValueError): | ||
self.subject.text() | ||
self.subject.tweet_data() | ||
|
||
def test_tweet_text(self): | ||
message = ( | ||
'🚨Gasto suspeito de Dep. @DepEduardoCunha (RJ). ' | ||
'Você pode me ajudar a verificar? ' | ||
'https://jarbas.serenata.ai/layers/#/documentId/10 ' | ||
'#SerenataDeAmor na @CamaraDeputados' | ||
) | ||
self.assertEqual(message, self.subject.tweet_text()) | ||
|
||
def test_camara_image_url(self): | ||
url = 'http://www.camara.gov.br/cota-parlamentar/documentos/publ/10/2015/10.pdf' | ||
self.assertEqual(url, self.subject.camara_image_url()) | ||
|
||
@mock.patch('whistleblower.targets.twitter.urllib.request.urlopen') | ||
def test_tweet_image(self, urlopen_mock): | ||
with open('tests/fixtures/10.pdf', 'rb') as pdf_fixture: | ||
mock_response = pdf_fixture | ||
mock_response_read = BytesIO(pdf_fixture.read()) | ||
urlopen_mock.return_value = mock_response_read | ||
self.assertIsInstance( | ||
self.subject.tweet_image(), BufferedReader) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have the impression that this block could be simpler: with open('tests/fixtures/10.pdf', 'rb') as pdf_fixture:
mock_response = pdf_fixture
mock_response_read = BytesIO(pdf_fixture.read())
urlopen_mock.return_value = mock_response_read
self.assertIsInstance(self.subject.tweet_image(), BufferedReader) You create with open('tests/fixtures/10.pdf', 'rb') as mock_response:
urlopen_mock.return_value = mock_response
self.assertIsInstance(self.subject.tweet_image(), BufferedReader) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I got this error when trying to just assign this variable to the
When debugging this code I came to the conclusion that in this way the I could simplify the test writing it like down below, but I would still have to read the content of the opened file file and cast it to a @mock.patch('whistleblower.targets.twitter.urllib.request.urlopen')
def test_tweet_image(self, urlopen_mock):
with open('tests/fixtures/10.pdf', 'rb') as mock_response:
urlopen_mock.return_value = BytesIO(mock_response.read())
self.assertIsInstance(self.subject.tweet_image(), BufferedReader) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks great : ) Way better! |
||
|
||
urlopen_mock.side_effect = Exception() | ||
self.assertIsNone(self.subject.tweet_image()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This could be a new test. The first assert would be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure! I did in this way because the other tests have both the success and error cases in the same method. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
import sys | ||
|
||
import cv2 | ||
import numpy | ||
|
||
TEXT_MIN_WIDTH = 35 | ||
TEXT_MIN_HEIGHT = 10 | ||
|
||
DEFAULT_WIDTH = 850 | ||
DEFAULT_HEIGHT = 1100 | ||
|
||
KERNEL_WIDTH = 25 | ||
KERNEL_HEIGHT = 15 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What are the meaning of these constants? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In theory, these are the distance (in pixels) between letters in the document to be considered the same line. They are used in the mathematical operations which glue the letters of the same line together. I'll add these comments. |
||
|
||
|
||
def remove_borders(image, threshold, max_width, max_height): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would you mind adding doctrings to explain a bit more about these arguments? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code is mine. I'll improve them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Many thanks, @begnini : ) |
||
height, width = image.shape[:2] | ||
|
||
for i in range(max_width): | ||
total = image[:, i].sum() / 255 | ||
if total > threshold: | ||
image[:, i] = numpy.ones(height) * 255 | ||
|
||
total = image[:, width - i - 1].sum() / 255 | ||
if total > threshold: | ||
image[:, i - 1] = numpy.ones(height) * 255 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could extract this logic into a helper function def remove_borders(image, threshold, max_width, max_height):
height. width, *_ = image.shape
image = _remove_border(image, width, max_width, threshold)
image = _remove_border(image, height, max_height, threshold)
return image And in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The indexes is different. The first loop look at columns, the other looks at lines. To break in one function, the code should rotate the image and this will be a lot weird. I can refactor to create a remove_column_borders and remove_line_borders ou something like that. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. True story! I missed that, my bad. Just ignore my comment then : ) |
||
|
||
for i in range(max_height): | ||
total = image[i, :].sum() / 255 | ||
if total > threshold: | ||
image[i, :] = numpy.ones(width) * 255 | ||
|
||
total = image[height - i - 1, :].sum() | ||
if total > threshold: | ||
image[height - i - 1, :] = numpy.ones(width) * 255 | ||
|
||
return image | ||
|
||
|
||
def crop(numpy_array, filename): | ||
image = cv2.imdecode(numpy_array, cv2.IMREAD_COLOR) | ||
gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) | ||
|
||
gray = remove_borders(gray, 0.8, 15, 15) | ||
|
||
adjusted_width = image.shape[1] / DEFAULT_WIDTH | ||
adjusted_height = image.shape[0] / DEFAULT_HEIGHT | ||
|
||
kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (KERNEL_WIDTH, KERNEL_HEIGHT)) | ||
eroded = cv2.erode(gray, kernel) | ||
|
||
_, bw = cv2.threshold(eroded, 127, 255, cv2.THRESH_BINARY_INV) | ||
|
||
total, markers = cv2.connectedComponents(bw) | ||
|
||
images = [numpy.uint8(markers==i) * 255 for i in range(total) if numpy.uint8(markers==i).sum() > 10] | ||
|
||
rectangles = [] | ||
|
||
for label in images: | ||
countours = cv2.findContours(label, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) | ||
|
||
(x,y,w,h) = cv2.boundingRect(countours[0]) | ||
|
||
rectangles.append((x, y, w, h, label.sum() / 255.0)) | ||
|
||
rectangles = sorted(rectangles, key=lambda x:x[4], reverse=True) | ||
|
||
rectangles = rectangles[1:] | ||
|
||
expanded = [sys.maxsize, sys.maxsize, -sys.maxsize, -sys.maxsize] | ||
|
||
for rect in rectangles: | ||
|
||
x0, y0, w0, h0 = expanded | ||
x1, y1, w1, h1, _ = rect | ||
|
||
if w1 <= (TEXT_MIN_WIDTH * adjusted_width): | ||
continue | ||
|
||
if h1 <= (TEXT_MIN_HEIGHT * adjusted_height): | ||
continue | ||
|
||
x = min(x0, x1) | ||
y = min(y0, y1) | ||
|
||
w = max(x0 + w0, x1 + w1) - x | ||
h = max(y0 + h0, y1 + h1) - y | ||
|
||
expanded = [x, y, w, h] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Roughly from line 50 and on you started to always use a blank line between instructions. I think a better use of blank likes could enhance the readability of this code making it more meaningful ; ) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They are grouped by similarity . For example, computing width and height, are always grouped, x0 and y0 too (is the same computing in different variables). I think if we add more blank lines will loose a little of reading. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't mean adding more spaces, but removing them : ) For example, grouping by similarity isn't clear in this snnipet:
They are defining the same variable, so I expected these lines to be grouped, not separated. Anyway: those are one of the minor comments. Feel free to ignore them anyway and altogether : ) |
||
|
||
cv2.imwrite(filename, image[y:y+h, x:x+w]) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,16 @@ | |
import os | ||
import re | ||
import urllib.request | ||
from tempfile import NamedTemporaryFile | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from pymongo import MongoClient | ||
import twitter | ||
from wand.image import Image | ||
|
||
from whistleblower.suspicions import Suspicions | ||
from whistleblower.helpers.crop import crop | ||
|
||
ACCESS_TOKEN_KEY = os.environ['TWITTER_ACCESS_TOKEN_KEY'] | ||
ACCESS_TOKEN_SECRET = os.environ['TWITTER_ACCESS_TOKEN_SECRET'] | ||
|
@@ -138,27 +141,73 @@ def __iter__(self): | |
yield 'text', self.status.text | ||
yield 'document_id', self.reimbursement['document_id'] | ||
|
||
def text(self): | ||
def tweet_data(self): | ||
""" | ||
Proper tweet message for the given reimbursement. | ||
Proper tweet data for the given reimbursement. | ||
""" | ||
profile = self.reimbursement['twitter_profile'] | ||
if profile: | ||
link = 'https://jarbas.serenata.ai/layers/#/documentId/{}'.format( | ||
self.reimbursement['document_id']) | ||
message = ( | ||
'🚨Gasto suspeito de Dep. @{} ({}). ' | ||
'Você pode me ajudar a verificar? ' | ||
'{} #SerenataDeAmor na @CamaraDeputados' | ||
).format(profile, self.reimbursement['state'], link) | ||
return message | ||
return self.tweet_text(), self.tweet_image() | ||
else: | ||
raise ValueError( | ||
'Congressperson does not have a registered Twitter account.') | ||
|
||
|
||
def tweet_text(self): | ||
link = 'https://jarbas.serenata.ai/layers/#/documentId/{}'.format( | ||
self.reimbursement['document_id']) | ||
message = ( | ||
'🚨Gasto suspeito de Dep. @{} ({}). ' | ||
'Você pode me ajudar a verificar? ' | ||
'{} #SerenataDeAmor na @CamaraDeputados' | ||
).format( | ||
self.reimbursement['twitter_profile'], | ||
self.reimbursement['state'], | ||
link | ||
) | ||
return message | ||
|
||
def camara_image_url(self): | ||
""" | ||
Proper image url for the given reimbursement. | ||
""" | ||
url = ( | ||
'http://www.camara.gov.br/cota-parlamentar/documentos/publ/' | ||
'{}/{}/{}.pdf'.format( | ||
self.reimbursement['applicant_id'], | ||
self.reimbursement['year'], | ||
self.reimbursement['document_id']) | ||
) | ||
|
||
return url | ||
|
||
def tweet_image(self): | ||
""" | ||
Download, crop and open the image for the given reimbursement. | ||
""" | ||
try: | ||
response = urllib.request.urlopen(self.camara_image_url()) | ||
|
||
image_bin = Image(file=response).make_blob('png') | ||
numpy_array = np.frombuffer(image_bin, np.uint8) | ||
|
||
with NamedTemporaryFile(suffix='.png') as temp: | ||
crop(numpy_array, temp.name) | ||
|
||
with open(temp.name, 'rb') as cropped_file: | ||
cropped_image = cropped_file | ||
except: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Bare Can you refactor these lines (188-202) using shorter blocks of code (usually a single line, but not strictly necessary) in the |
||
return None | ||
|
||
return cropped_image | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this code work? I mean, when I exit a context manager created with Alternatively you could write your own context manager to have more control and to be more explicit about the temporary file deletion: …
from contextlib import contextmanager
…
class Tweet:
…
@contextmanager
def receipt_pdf_as_png(self, pdf_response):
image_bin = Image(file= pdf_response).make_blob('png')
numpy_array = np.frombuffer(image_bin, np.uint8)
# write PNG to a temp file
temp = NamedTemporaryFile(delete=False, suffix='.png')
with open(temp.name, 'wb') as fobj:
crop(numpy_array, temp.name)
# return PNG as a file object
with open(temp.name, 'rb') as fobj:
yield fobj
# delete temporary file
os.remove(temp.name)
def tweet_image(self):
try:
response = urllib.request.urlopen(self.camara_image_url())
except urllib.error.HTTPError:
return None
with self.receipt_pdf_as_png(response) as image:
return image |
||
|
||
def publish(self): | ||
""" | ||
Post the update to Twitter's timeline. | ||
""" | ||
self.status = self.api.PostUpdate(self.text()) | ||
text, reimbursement_image = self.tweet_data() | ||
|
||
self.status = self.api.PostUpdate( | ||
status=text, | ||
media=reimbursement_image) | ||
self.database.posts.insert_one(dict(self)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this a lot of code just to install OpenCV: I see two alternatives here.
The first one is to avoid adding too many layers (too many separated
RUN
commands). This makes the image grows big and makes it difficult to maintain (after a couple of weeks/months people might wonder which lines to remove if we don't need openCV anymore). This solution might be as simple as:Another alternative is to use an image that already has Python 3.6, OpenCV and the data science kit (NumPy, Pandas, scikit-learn) instead of the simple
python:3.6.3-alpine
. Not sure if it exists though.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://hub.docker.com/r/continuumio/anaconda3/ may be a good alternative.