okfn-brasil · rodolfolottin · Feb 3, 2018 · Feb 19, 2018 · Feb 19, 2018 · Feb 24, 2018
diff --git a/Dockerfile b/Dockerfile
@@ -3,6 +3,9 @@ FROM python:3.6.3-alpine
 RUN apk add --no-cache --virtual build-base \
   && apk add --no-cache --virtual libxml2-dev \
   && apk add --no-cache --virtual libxslt-dev \
+  && apk add --no-cache --virtual imagemagick \
+  && apk add --no-cache --virtual imagemagick-dev \
+  && apk add --no-cache --virtual ghostscript \
   && mkdir -p /usr/include/libxml \
   && ln -s /usr/include/libxml2/libxml/xmlexports.h /usr/include/libxml/xmlexports.h \
   && ln -s /usr/include/libxml2/libxml/xmlversion.h /usr/include/libxml/xmlversion.h
@@ -11,6 +14,65 @@ RUN apk add --no-cache --virtual build-base \
 # COPY rosie/config.ini.example ./config.ini
 # COPY rosie/requirements.txt ./rosie
 # RUN pip install -r rosie/requirements.txt
+# Append four tagged edge repositories to apk's repository list. Only the
+# `@testing` tag is referenced by the package list that follows.
+# NOTE(review): `@edgunity` looks like a typo for an edge/community tag and
+# duplicates `@community` on another mirror; confirm before renaming.
+RUN printf '%s\n' \
+    '@edgunity http://nl.alpinelinux.org/alpine/edge/community' \
+    '@edge http://nl.alpinelinux.org/alpine/edge/main' \
+    '@testing http://nl.alpinelinux.org/alpine/edge/testing' \
+    '@community http://dl-cdn.alpinelinux.org/alpine/edge/community' \
+  >> /etc/apk/repositories
+
+# Toolchain and codec headers used by the OpenCV build below, plus numpy,
+# whose headers the cmake invocation points at.
+# The packages are passed as plain arguments: `--virtual <word>` would treat
+# the word as a virtual package name and install nothing.
+# NOTE(review): numpy is unpinned; consider pinning it for reproducible builds.
+RUN apk add --no-cache \
+    clang \
+    clang-dev \
+    cmake \
+    jasper-dev \
+    libjpeg \
+    libjpeg-turbo-dev \
+    libpng-dev \
+    libtbb-dev@testing \
+    libtbb@testing \
+    libwebp-dev \
+    linux-headers \
+    openblas-dev \
+    tiff-dev \
+    unzip \
+    wget \
+  && pip install --no-cache-dir numpy
+
+# Compile with clang; `key=value` is the non-deprecated ENV form.
+ENV CC=/usr/bin/clang \
+    CXX=/usr/bin/clang++
+
+# OpenCV release fetched and built from source below.
+ENV OPENCV_VERSION=3.1.0
+
+# Download, build and install OpenCV in ONE layer: the source tree removed at
+# the end would otherwise still be stored in the earlier layer that created it.
+# `mkdir -p` keeps the step working when /opt already exists.
+# NOTE(review): the archive is not checksum-verified; consider pinning a sha256.
+RUN mkdir -p /opt && cd /opt && \
+  wget https://github.com/opencv/opencv/archive/${OPENCV_VERSION}.zip && \
+  unzip ${OPENCV_VERSION}.zip && \
+  rm -rf ${OPENCV_VERSION}.zip && \
+  mkdir -p /opt/opencv-${OPENCV_VERSION}/build && \
+  cd /opt/opencv-${OPENCV_VERSION}/build && \
+  cmake \
+  -D CMAKE_BUILD_TYPE=RELEASE \
+  -D CMAKE_INSTALL_PREFIX=/usr/local \
+  -D WITH_FFMPEG=NO \
+  -D WITH_IPP=NO \
+  -D WITH_OPENEXR=NO \
+  -D WITH_TBB=YES \
+  -D BUILD_EXAMPLES=NO \
+  -D BUILD_ANDROID_EXAMPLES=NO \
+  -D INSTALL_PYTHON_EXAMPLES=NO \
+  -D BUILD_DOCS=NO \
+  -D BUILD_opencv_python2=NO \
+  -D BUILD_opencv_python3=ON \
+  -D PYTHON3_EXECUTABLE=/usr/local/bin/python \
+  -D PYTHON3_INCLUDE_DIR=/usr/local/include/python3.6m/ \
+  -D PYTHON3_LIBRARY=/usr/local/lib/libpython3.so \
+  -D PYTHON_LIBRARY=/usr/local/lib/libpython3.so \
+  -D PYTHON3_PACKAGES_PATH=/usr/local/lib/python3.6/site-packages/ \
+  -D PYTHON3_NUMPY_INCLUDE_DIRS=/usr/local/lib/python3.6/site-packages/numpy/core/include/ \
+  .. && \
+  make VERBOSE=1 && \
+  make install && \
+  cd / && \
+  rm -rf /opt/opencv-${OPENCV_VERSION}
 
 WORKDIR /usr/src/app
 COPY requirements.txt ./

diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,10 @@
 amqp==2.2.2
 celery==4.1.0
 ipdb==0.10.3
+# NOTE(review): the Dockerfile in this change also builds OpenCV 3.1.0 from
+# source with Python 3 bindings, so inside the image this pin may be redundant
+# or provide a second, different cv2 - confirm which one is intended.
+opencv-python==3.4.0.12
 pandas==0.21.0
 pymongo==3.5.1
 python-twitter==3.3
 requests==2.18.4
 serenata-toolbox==12.2.2
+wand==0.4.4
diff --git a/tests/fixtures/10.pdf b/tests/fixtures/10.pdf
diff --git a/tests/fixtures/10.png b/tests/fixtures/10.png
diff --git a/tests/targets/test_twitter.py b/tests/targets/test_twitter.py
@@ -1,5 +1,6 @@
 import datetime
 from unittest import TestCase, mock
+from io import BytesIO, BufferedReader
 
 import pandas as pd
 from twitter import TwitterError
@@ -108,6 +109,8 @@ def setUp(self):
         self.reimbursement = {
             'congressperson_name': 'Eduardo Cunha',
             'document_id': 10,
+            'applicant_id': 10,
+            'year': 2015,
             'state': 'RJ',
             'twitter_profile': 'DepEduardoCunha',
         }
@@ -117,19 +120,48 @@ def setUp(self):
 
     def test_publish(self):
-        self.subject.publish()
-        self.api.PostUpdate.assert_called_once_with(self.subject.text())
+        # Stub the image step: the real tweet_image() downloads a PDF, so the
+        # test would otherwise depend on the network. The stub also lets the
+        # assertion check that the image is really forwarded as `media`.
+        image = mock.Mock()
+        with mock.patch.object(
+                self.subject, 'tweet_image', return_value=image):
+            self.subject.publish()
+            text, reimbursement_image = self.subject.tweet_data()
+        self.assertIs(image, reimbursement_image)
+        self.api.PostUpdate.assert_called_once_with(
+            media=reimbursement_image, status=text)
         dict_representation = dict(self.subject)
         self.database.posts.insert_one.assert_called_once_with(
             dict_representation)
 
-    def test_text(self):
+    def test_tweet_data(self):
         message = (
             '🚨Gasto suspeito de Dep. @DepEduardoCunha (RJ). '
             'Você pode me ajudar a verificar? '
             'https://jarbas.serenata.ai/layers/#/documentId/10 '
             '#SerenataDeAmor na @CamaraDeputados'
         )
-        self.assertEqual(message, self.subject.text())
+        # Stub the image step so the test never touches the network and the
+        # expected image is known rather than "whatever a failed download
+        # returns".
+        image = mock.Mock()
+        with mock.patch.object(
+                self.subject, 'tweet_image', return_value=image):
+            self.assertEqual(
+                (message, image), self.subject.tweet_data())
         self.reimbursement['twitter_profile'] = None
         with self.assertRaises(ValueError):
-            self.subject.text()
+            self.subject.tweet_data()
+
+    def test_tweet_text(self):
+        # Profile, state and document id below come from the setUp fixture.
+        expected = ''.join([
+            '🚨Gasto suspeito de Dep. @DepEduardoCunha (RJ). ',
+            'Você pode me ajudar a verificar? ',
+            'https://jarbas.serenata.ai/layers/#/documentId/10 ',
+            '#SerenataDeAmor na @CamaraDeputados',
+        ])
+        self.assertEqual(expected, self.subject.tweet_text())
+
+    def test_camara_image_url(self):
+        # applicant_id=10, year=2015 and document_id=10 come from setUp.
+        expected = (
+            'http://www.camara.gov.br/cota-parlamentar/documentos/publ/'
+            '10/2015/10.pdf'
+        )
+        self.assertEqual(expected, self.subject.camara_image_url())
+
+    @mock.patch('whistleblower.targets.twitter.urllib.request.urlopen')
+    def test_tweet_image(self, urlopen_mock):
+        # Serve the PDF fixture from memory in place of the HTTP response.
+        with open('tests/fixtures/10.pdf', 'rb') as pdf_fixture:
+            urlopen_mock.return_value = BytesIO(pdf_fixture.read())
+        image = self.subject.tweet_image()
+        self.assertIsInstance(image, BufferedReader)
+        # Release the handle; closing an already-closed file is a no-op.
+        image.close()
+
+        # Any failure while fetching the document yields no image.
+        urlopen_mock.side_effect = Exception()
+        self.assertIsNone(self.subject.tweet_image())
diff --git a/whistleblower/helpers/crop.py b/whistleblower/helpers/crop.py
@@ -0,0 +1,92 @@
+import sys
+
+import cv2
+import numpy
+
+# Size thresholds for connected components in crop(): a component whose
+# width/height is not larger than these (scaled by the page-size ratio) is
+# left out of the crop region.
+TEXT_MIN_WIDTH = 35
+TEXT_MIN_HEIGHT = 10
+
+# Reference page size; crop() divides the decoded image's width and height by
+# these to scale the thresholds above.
+DEFAULT_WIDTH  = 850
+DEFAULT_HEIGHT = 1100
+
+# Size passed to cv2.getStructuringElement for the erosion kernel in crop().
+KERNEL_WIDTH  = 25
+KERNEL_HEIGHT = 15
+
+
+def remove_borders(image, threshold, max_width, max_height):
+    """
+    Paint bright border columns and rows white, in place.
+
+    For each of the ``max_width`` outermost columns on both sides, and each
+    of the ``max_height`` outermost rows at top and bottom, the pixel sum
+    divided by 255 is compared with ``threshold``; lines above it are set
+    to 255.
+
+    :param image: 2-D array (crop() passes a grayscale image); modified
+        in place.
+    :param threshold: limit for ``sum / 255`` of a row or column.
+        NOTE(review): crop() passes 0.8, which suggests a fraction of the
+        line length may have been intended - confirm.
+    :param max_width: number of columns inspected on each side.
+    :param max_height: number of rows inspected at top and bottom.
+    :return: the same ``image`` object.
+    """
+    height, width = image.shape[:2]
+
+    for i in range(max_width):
+        # Left column i and its mirror on the right edge. The mirror used to
+        # be tested but column ``i - 1`` was written instead.
+        for column in (i, width - i - 1):
+            if image[:, column].sum() / 255 > threshold:
+                image[:, column] = 255
+
+    for i in range(max_height):
+        # Top row i and its mirror at the bottom. The bottom test used to
+        # skip the ``/ 255`` normalisation applied to every other edge.
+        for row in (i, height - i - 1):
+            if image[row, :].sum() / 255 > threshold:
+                image[row, :] = 255
+
+    return image
+
+
+def crop(numpy_array, filename):
+    """
+    Decode an image, crop it to its large dark components and save it.
+
+    The image is converted to grayscale, its borders are cleaned with
+    ``remove_borders``, then it is eroded and thresholded. Connected
+    components bigger than the scaled ``TEXT_MIN_*`` sizes are merged into
+    one bounding box, and that region of the colour image is written to
+    ``filename``. When no component qualifies the whole image is written.
+
+    :param numpy_array: 1-D uint8 buffer holding an encoded image.
+    :param filename: output path handed to ``cv2.imwrite``.
+    :raises ValueError: if the buffer cannot be decoded as an image.
+    """
+    image = cv2.imdecode(numpy_array, cv2.IMREAD_COLOR)
+    if image is None:
+        raise ValueError('Could not decode image data.')
+
+    # imdecode returns channels in BGR order.
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+    gray = remove_borders(gray, 0.8, 15, 15)
+
+    adjusted_width = image.shape[1] / DEFAULT_WIDTH
+    adjusted_height = image.shape[0] / DEFAULT_HEIGHT
+
+    kernel = cv2.getStructuringElement(
+        cv2.MORPH_CROSS, (KERNEL_WIDTH, KERNEL_HEIGHT))
+    eroded = cv2.erode(gray, kernel)
+
+    _, bw = cv2.threshold(eroded, 127, 255, cv2.THRESH_BINARY_INV)
+
+    total, markers = cv2.connectedComponents(bw)
+
+    rectangles = []
+    for i in range(total):
+        mask = numpy.uint8(markers == i)
+        area = int(mask.sum())
+        if area <= 10:
+            continue
+        # boundingRect accepts a single-channel mask directly, which avoids
+        # findContours and its version-dependent return tuple.
+        x, y, w, h = cv2.boundingRect(mask)
+        rectangles.append((x, y, w, h, area))
+
+    # Drop the component with the largest area (presumably the page
+    # background) and keep the rest.
+    rectangles = sorted(rectangles, key=lambda rect: rect[4], reverse=True)
+    rectangles = rectangles[1:]
+
+    left, top, right, bottom = sys.maxsize, sys.maxsize, 0, 0
+    found = False
+
+    for x1, y1, w1, h1, _ in rectangles:
+        if w1 <= (TEXT_MIN_WIDTH * adjusted_width):
+            continue
+
+        if h1 <= (TEXT_MIN_HEIGHT * adjusted_height):
+            continue
+
+        left = min(left, x1)
+        top = min(top, y1)
+        right = max(right, x1 + w1)
+        bottom = max(bottom, y1 + h1)
+        found = True
+
+    # Without a qualifying component there is no crop box; previously this
+    # reused stale loop variables or raised NameError.
+    cropped = image[top:bottom, left:right] if found else image
+
+    cv2.imwrite(filename, cropped)
diff --git a/whistleblower/targets/twitter.py b/whistleblower/targets/twitter.py
@@ -3,13 +3,16 @@
 import os
 import re
 import urllib.request
+from tempfile import NamedTemporaryFile
 
 import numpy as np
 import pandas as pd
 from pymongo import MongoClient
 import twitter
+from wand.image import Image
 
 from whistleblower.suspicions import Suspicions
+from whistleblower.helpers.crop import crop
 
 ACCESS_TOKEN_KEY = os.environ['TWITTER_ACCESS_TOKEN_KEY']
 ACCESS_TOKEN_SECRET = os.environ['TWITTER_ACCESS_TOKEN_SECRET']
@@ -138,27 +141,73 @@ def __iter__(self):
         yield 'text', self.status.text
         yield 'document_id', self.reimbursement['document_id']
 
-    def text(self):
+    def tweet_data(self):
         """
-        Proper tweet message for the given reimbursement.
+        Text and image to tweet for the given reimbursement.
+
+        Returns a ``(text, image)`` tuple built by ``tweet_text()`` and
+        ``tweet_image()``; raises ``ValueError`` when the reimbursement has
+        no ``twitter_profile``.
         """
-        profile = self.reimbursement['twitter_profile']
-        if profile:
-            link = 'https://jarbas.serenata.ai/layers/#/documentId/{}'.format(
-                self.reimbursement['document_id'])
-            message = (
-                '🚨Gasto suspeito de Dep. @{} ({}). '
-                'Você pode me ajudar a verificar? '
-                '{} #SerenataDeAmor na @CamaraDeputados'
-            ).format(profile, self.reimbursement['state'], link)
-            return message
-        else:
-            raise ValueError(
-                'Congressperson does not have a registered Twitter account.')
+        if not self.reimbursement['twitter_profile']:
+            raise ValueError(
+                'Congressperson does not have a registered Twitter account.')
+        return self.tweet_text(), self.tweet_image()
 
+
+    def tweet_text(self):
+        """
+        Tweet message for the given reimbursement: mentions the
+        congressperson's Twitter profile and state and links to the document
+        on Jarbas.
+        """
+        reimbursement = self.reimbursement
+        template = (
+            '🚨Gasto suspeito de Dep. @{} ({}). '
+            'Você pode me ajudar a verificar? '
+            '{} #SerenataDeAmor na @CamaraDeputados'
+        )
+        link = 'https://jarbas.serenata.ai/layers/#/documentId/{}'.format(
+            reimbursement['document_id'])
+        return template.format(
+            reimbursement['twitter_profile'], reimbursement['state'], link)
+
+    def camara_image_url(self):
+        """
+        URL of the PDF document for the given reimbursement on
+        www.camara.gov.br, built from its applicant id, year and document id.
+        """
+        document_path = '{}/{}/{}.pdf'.format(
+            self.reimbursement['applicant_id'],
+            self.reimbursement['year'],
+            self.reimbursement['document_id'])
+        base = 'http://www.camara.gov.br/cota-parlamentar/documentos/publ/'
+        return base + document_path
+
+    def tweet_image(self):
+        """
+        Download, crop and open the image for the given reimbursement.
+
+        The PDF at ``camara_image_url()`` is converted to PNG, cropped with
+        ``crop`` and returned as an open binary file object (mode ``'rb'``).
+        The caller is responsible for closing it. Returns ``None`` when any
+        step fails, so a tweet can still go out without media.
+        """
+        import logging
+
+        try:
+            response = urllib.request.urlopen(self.camara_image_url())
+            try:
+                with Image(file=response) as document:
+                    image_bin = document.make_blob('png')
+            finally:
+                response.close()
+
+            numpy_array = np.frombuffer(image_bin, np.uint8)
+
+            with NamedTemporaryFile(suffix='.png') as temp:
+                crop(numpy_array, temp.name)
+                # Open the result BEFORE leaving the block: the temporary
+                # file is unlinked on exit, but an already-open handle stays
+                # readable (POSIX). Returning a handle opened inside a nested
+                # `with`, as before, handed the caller a closed file.
+                return open(temp.name, 'rb')
+        except Exception:
+            # Best effort by design, but leave a trace of what went wrong.
+            logging.getLogger(__name__).warning(
+                'Could not build the reimbursement image.', exc_info=True)
+            return None
+
     def publish(self):
         """
-        Post the update to Twitter's timeline.
+        Post the update to Twitter's timeline and record it in the database.
+
+        The status text and optional media come from ``tweet_data()``. The
+        media file, when there is one, is closed once the API call returns
+        or fails.
         """
-        self.status = self.api.PostUpdate(self.text())
+        text, reimbursement_image = self.tweet_data()
+
+        try:
+            self.status = self.api.PostUpdate(
+                status=text,
+                media=reimbursement_image)
+        finally:
+            # tweet_image() hands over an open file; release it here so the
+            # handle does not leak.
+            if reimbursement_image is not None:
+                reimbursement_image.close()
         self.database.posts.insert_one(dict(self))