Skip to content
This repository has been archived by the owner on Sep 18, 2018. It is now read-only.

Attach document image to tweet #29

Open
wants to merge 27 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
d8c8e46
Update Dockerfile with imagemagick packages
rodolfolottin Feb 3, 2018
d082972
Rename text() method and create a method to get all the tweet data
rodolfolottin Feb 19, 2018
5ad5a34
Implements method to build the camara image url
rodolfolottin Feb 19, 2018
1b054ae
Add wand in requirements.txt
rodolfolottin Feb 24, 2018
7822531
Update Dockerfile with opencv and ghostscript
rodolfolottin Mar 14, 2018
768c1bc
Crop a given png file in order to remove its white areas
rodolfolottin Mar 14, 2018
653275f
Add image pdf and png to fixtures
rodolfolottin Mar 15, 2018
3bccd45
Change camara url
rodolfolottin Apr 8, 2018
eb0d8cd
Add tweet_image call in publish method
rodolfolottin Apr 8, 2018
e5a53dc
Implement method to retrieve the reimbursement pdf and call the crop …
rodolfolottin Apr 8, 2018
f0b16cb
Add the image to tweet_data
rodolfolottin Apr 8, 2018
1f5fd5f
Resolve merge conflicts
rodolfolottin Apr 8, 2018
38c2b5d
Add opencv to requirements.txt file
rodolfolottin Apr 8, 2018
32be9f4
Sort imports descending
rodolfolottin Apr 15, 2018
5f0cae7
Remove unused var
rodolfolottin Apr 15, 2018
b0dc505
Specify HTTPError except when trying to retrieve the reimbursement image
rodolfolottin Apr 15, 2018
c841e32
Start using anaconda docker image instead of the simple alpine python…
rodolfolottin Apr 22, 2018
e88e8f2
Add apt-get command to install libmagickwand-dev
rodolfolottin Apr 23, 2018
fa7a805
Change adduser parameter to add a system user
rodolfolottin Apr 23, 2018
622268a
Remove code from try block
rodolfolottin Apr 23, 2018
60d87be
Remove unused fixtures
rodolfolottin Apr 28, 2018
23f997b
Refactor tweet_image and tests
rodolfolottin Apr 28, 2018
2be5a44
Update Dockerfile with ghostscript related packages
rodolfolottin Apr 28, 2018
0c065d8
Update Dockerfile with imagemagick package
rodolfolottin Apr 28, 2018
3c4d8e0
Add apt-get update to Dockerfile script
rodolfolottin May 4, 2018
1a5d446
Update Dockerfile and travis.yml with Wand required packages
rodolfolottin May 5, 2018
0fccf2e
Update .travis.yml dist and scripts
rodolfolottin May 5, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
dist: trusty
language: python
python: 3.6
cache: pip
install:
- "travis_retry sudo apt-get update"
- "travis_retry sudo apt-get -qq install libfreetype6-dev liblcms2-dev python-qt4 ghostscript libffi-dev libjpeg-turbo-progs cmake imagemagick"
- pip install -r requirements.txt
- pip install coveralls
env:
Expand Down
26 changes: 13 additions & 13 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
FROM python:3.6.3-alpine
FROM continuumio/anaconda3

RUN apk add --no-cache --virtual build-base \
&& apk add --no-cache --virtual libxml2-dev \
&& apk add --no-cache --virtual libxslt-dev \
&& mkdir -p /usr/include/libxml \
&& ln -s /usr/include/libxml2/libxml/xmlexports.h /usr/include/libxml/xmlexports.h \
&& ln -s /usr/include/libxml2/libxml/xmlversion.h /usr/include/libxml/xmlversion.h

# RUN mkdir rosie
# COPY rosie/config.ini.example ./config.ini
# COPY rosie/requirements.txt ./rosie
# RUN pip install -r rosie/requirements.txt
RUN apt-get update && apt-get install -y \
libmagickwand-dev \
ghostscript \
libgs-dev \
imagemagick \
build-essential \
libxml2-dev \
libxslt1-dev \
python3-dev \
unzip \
zlib1g-dev

WORKDIR /usr/src/app
COPY requirements.txt ./
RUN pip install -r requirements.txt

RUN adduser -S serenata_de_amor
RUN adduser --system serenata_de_amor
RUN chown -hR serenata_de_amor .
USER serenata_de_amor

Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
amqp==2.2.2
celery==4.1.0
ipdb==0.10.3
opencv-python==3.4.0.12
pandas==0.21.0
pymongo==3.5.1
python-twitter==3.3
requests==2.18.4
serenata-toolbox==12.2.2
wand==0.4.4
Binary file added tests/fixtures/reimbursement_file.txt
Binary file not shown.
46 changes: 42 additions & 4 deletions tests/targets/test_twitter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import datetime
from io import BufferedReader
from unittest import TestCase, mock

import pandas as pd
from twitter import TwitterError
import urllib.error

from whistleblower.targets.twitter import Post, Twitter

Expand Down Expand Up @@ -108,6 +110,8 @@ def setUp(self):
self.reimbursement = {
'congressperson_name': 'Eduardo Cunha',
'document_id': 10,
'applicant_id': 10,
'year': 2015,
'state': 'RJ',
'twitter_profile': 'DepEduardoCunha',
}
Expand All @@ -117,19 +121,53 @@ def setUp(self):

def test_publish(self):
self.subject.publish()
self.api.PostUpdate.assert_called_once_with(self.subject.text())
text, reimbursement_image = self.subject.tweet_data()
self.api.PostUpdate.assert_called_once_with(
media=reimbursement_image, status=text)
dict_representation = dict(self.subject)
self.database.posts.insert_one.assert_called_once_with(
dict_representation)

def test_text(self):
def test_tweet_data(self):
message = (
'🚨Gasto suspeito de Dep. @DepEduardoCunha (RJ). '
'Você pode me ajudar a verificar? '
'https://jarbas.serenata.ai/layers/#/documentId/10 '
'#SerenataDeAmor na @CamaraDeputados'
)
self.assertEqual(message, self.subject.text())
self.assertEqual(
(message, None), self.subject.tweet_data())
self.reimbursement['twitter_profile'] = None
with self.assertRaises(ValueError):
self.subject.text()
self.subject.tweet_data()

def test_tweet_text(self):
message = (
'🚨Gasto suspeito de Dep. @DepEduardoCunha (RJ). '
'Você pode me ajudar a verificar? '
'https://jarbas.serenata.ai/layers/#/documentId/10 '
'#SerenataDeAmor na @CamaraDeputados'
)
self.assertEqual(message, self.subject.tweet_text())

def test_camara_image_url(self):
url = 'http://www.camara.gov.br/cota-parlamentar/documentos/publ/10/2015/10.pdf'
self.assertEqual(url, self.subject.camara_image_url())

@mock.patch('whistleblower.targets.twitter.os.remove')
@mock.patch('whistleblower.targets.twitter.urllib.request.urlopen')
def test_tweet_image_success(self, urlopen_mock, os_remove_mock):
mock_object = mock.MagicMock(status_code=200)
with open('tests/fixtures/reimbursement_file.txt', 'rb') as mock_response:
mock_object.read.return_value = mock_response.read()

urlopen_mock.return_value = mock_object
self.assertIsInstance(self.subject.tweet_image(), BufferedReader)
os_remove_mock.assert_called_once()

@mock.patch('whistleblower.targets.twitter.urllib.request.urlopen')
def test_tweet_image_error(self, urlopen_mock):
urlopen_mock.side_effect = urllib.error.HTTPError(
url='mock_url', code=404, msg='Not Found',
hdrs='mock_headers', fp=None)
self.assertIsNone(self.subject.tweet_image())
92 changes: 92 additions & 0 deletions whistleblower/helpers/crop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import sys

import cv2
import numpy

TEXT_MIN_WIDTH = 35
TEXT_MIN_HEIGHT = 10

DEFAULT_WIDTH = 850
DEFAULT_HEIGHT = 1100

KERNEL_WIDTH = 25
KERNEL_HEIGHT = 15
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are the meaning of these constants?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In theory, these are the distance (in pixels) between letters in the document to be considered the same line. They are used in the mathematical operations which glue the letters of the same line together. I'll add these comments.



def remove_borders(image, threshold, max_width, max_height):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you mind adding doctrings to explain a bit more about these arguments?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code is mine. I'll improve them.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Many thanks, @begnini : )

height, width = image.shape[:2]

for i in range(max_width):
total = image[:, i].sum() / 255
if total > threshold:
image[:, i] = numpy.ones(height) * 255

total = image[:, width - i - 1].sum() / 255
if total > threshold:
image[:, i - 1] = numpy.ones(height) * 255
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could extract this logic into a helper function _remove_border(img, size, max_size, threshold) in the DRY philosophy. So remove_borders could be simpler and we could document further what happens in _remove_border:

def remove_borders(image, threshold, max_width, max_height):
    height. width, *_ = image.shape
    image = _remove_border(image, width, max_width, threshold)
    image = _remove_border(image, height, max_height, threshold)
    return image

And in _remove_border you could document the rational underneath it ; )

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The indexes is different. The first loop look at columns, the other looks at lines. To break in one function, the code should rotate the image and this will be a lot weird. I can refactor to create a remove_column_borders and remove_line_borders ou something like that.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True story! I missed that, my bad. Just ignore my comment then : )


for i in range(max_height):
total = image[i, :].sum() / 255
if total > threshold:
image[i, :] = numpy.ones(width) * 255

total = image[height - i - 1, :].sum()
if total > threshold:
image[height - i - 1, :] = numpy.ones(width) * 255

return image


def crop(numpy_array, filename):
image = cv2.imdecode(numpy_array, cv2.IMREAD_COLOR)
gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

gray = remove_borders(gray, 0.8, 15, 15)

adjusted_width = image.shape[1] / DEFAULT_WIDTH
adjusted_height = image.shape[0] / DEFAULT_HEIGHT

kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (KERNEL_WIDTH, KERNEL_HEIGHT))
eroded = cv2.erode(gray, kernel)

_, bw = cv2.threshold(eroded, 127, 255, cv2.THRESH_BINARY_INV)

total, markers = cv2.connectedComponents(bw)

images = [numpy.uint8(markers==i) * 255 for i in range(total) if numpy.uint8(markers==i).sum() > 10]

rectangles = []

for label in images:
countours = cv2.findContours(label, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

(x,y,w,h) = cv2.boundingRect(countours[0])

rectangles.append((x, y, w, h, label.sum() / 255.0))

rectangles = sorted(rectangles, key=lambda x:x[4], reverse=True)

rectangles = rectangles[1:]

expanded = [sys.maxsize, sys.maxsize, -sys.maxsize, -sys.maxsize]

for rect in rectangles:

x0, y0, w0, h0 = expanded
x1, y1, w1, h1, _ = rect

if w1 <= (TEXT_MIN_WIDTH * adjusted_width):
continue

if h1 <= (TEXT_MIN_HEIGHT * adjusted_height):
continue

x = min(x0, x1)
y = min(y0, y1)

w = max(x0 + w0, x1 + w1) - x
h = max(y0 + h0, y1 + h1) - y

expanded = [x, y, w, h]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Roughly from line 50 and on you started to always use a blank line between instructions. I think a better use of blank likes could enhance the readability of this code making it more meaningful ; )

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are grouped by similarity . For example, computing width and height, are always grouped, x0 and y0 too (is the same computing in different variables). I think if we add more blank lines will loose a little of reading.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't mean adding more spaces, but removing them : )

For example, grouping by similarity isn't clear in this snnipet:

rectangles = sorted(rectangles, key=lambda x:x[4], reverse=True)
 
rectangles = rectangles[1:]

They are defining the same variable, so I expected these lines to be grouped, not separated.

Anyway: those are one of the minor comments. Feel free to ignore them anyway and altogether : )


cv2.imwrite(filename, image[y:y+h, x:x+w])
82 changes: 71 additions & 11 deletions whistleblower/targets/twitter.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
import datetime
from contextlib import contextmanager
import logging
import os
import re
from tempfile import NamedTemporaryFile
import urllib.request
import urllib.error

import numpy as np
import pandas as pd
from pymongo import MongoClient
import twitter
from wand.image import Image

from whistleblower.suspicions import Suspicions
from whistleblower.helpers.crop import crop

ACCESS_TOKEN_KEY = os.environ['TWITTER_ACCESS_TOKEN_KEY']
ACCESS_TOKEN_SECRET = os.environ['TWITTER_ACCESS_TOKEN_SECRET']
Expand Down Expand Up @@ -138,27 +143,82 @@ def __iter__(self):
yield 'text', self.status.text
yield 'document_id', self.reimbursement['document_id']

def text(self):
def tweet_data(self):
"""
Proper tweet message for the given reimbursement.
Proper tweet data for the given reimbursement.
"""
profile = self.reimbursement['twitter_profile']
if profile:
link = 'https://jarbas.serenata.ai/layers/#/documentId/{}'.format(
self.reimbursement['document_id'])
message = (
'🚨Gasto suspeito de Dep. @{} ({}). '
'Você pode me ajudar a verificar? '
'{} #SerenataDeAmor na @CamaraDeputados'
).format(profile, self.reimbursement['state'], link)
return message
return self.tweet_text(), self.tweet_image()
else:
raise ValueError(
'Congressperson does not have a registered Twitter account.')


def tweet_text(self):
link = 'https://jarbas.serenata.ai/layers/#/documentId/{}'.format(
self.reimbursement['document_id'])
message = (
'🚨Gasto suspeito de Dep. @{} ({}). '
'Você pode me ajudar a verificar? '
'{} #SerenataDeAmor na @CamaraDeputados'
).format(
self.reimbursement['twitter_profile'],
self.reimbursement['state'],
link
)
return message

def camara_image_url(self):
"""
Proper image url for the given reimbursement.
"""
url = (
'http://www.camara.gov.br/cota-parlamentar/documentos/publ/'
'{}/{}/{}.pdf'.format(
self.reimbursement['applicant_id'],
self.reimbursement['year'],
self.reimbursement['document_id'])
)

return url

@contextmanager
def receipt_pdf_as_png(self, pdf_response):
image_bin = Image(file=pdf_response).make_blob('png')
numpy_array = np.frombuffer(image_bin, np.uint8)

# write PNG to a temp file
temp = NamedTemporaryFile(delete=False, suffix='.png')
with open(temp.name, 'wb') as fobj:
crop(numpy_array, temp.name)

# return PNG as a file object
with open(temp.name, 'rb') as fobj:
yield fobj

# delete temporary file
os.remove(temp.name)

def tweet_image(self):
"""
Download, crop and open the image for the given reimbursement.
"""
try:
response = urllib.request.urlopen(self.camara_image_url())
except urllib.error.HTTPError:
return None

with self.receipt_pdf_as_png(response) as image:
return image

def publish(self):
"""
Post the update to Twitter's timeline.
"""
self.status = self.api.PostUpdate(self.text())
text, reimbursement_image = self.tweet_data()

self.status = self.api.PostUpdate(
status=text,
media=reimbursement_image)
self.database.posts.insert_one(dict(self))