okfn-brasil · silviodc · Jul 20, 2017 · Jul 28, 2017 · Jul 28, 2017 · Jul 29, 2017
diff --git a/.travis.yml b/.travis.yml
@@ -1,7 +1,10 @@
+dist: trusty
 language: python
 python: 3.6
 cache: pip
 install:
+  - "travis_retry sudo apt-get update"
+  - "travis_retry sudo apt-get -qq install libfreetype6-dev liblcms2-dev python-qt4 ghostscript libffi-dev libjpeg-turbo-progs cmake imagemagick"
   - ./setup
   - pip install coveralls
 script:

diff --git a/Dockerfile b/Dockerfile
@@ -1,22 +1,22 @@
 FROM python:3.5
-
 USER root
-
 RUN apt-get update && apt-get install -y \
   build-essential \
   libxml2-dev \
   libxslt1-dev \
   python3-dev \
   unzip \
+  libmagickwand-dev \
+  ghostscript \
   zlib1g-dev
-
-RUN pip install --upgrade pip
 
+RUN pip install --upgrade pip
 COPY requirements.txt ./
 COPY setup ./
 COPY rosie.py ./
 COPY rosie ./rosie
 COPY config.ini.example ./
+COPY config.ini ./
 
 RUN ./setup
 

diff --git a/requirements.txt b/requirements.txt
@@ -3,3 +3,9 @@ pycpfcnpj==1.0.2
 scikit-learn==0.18.1
 scipy==0.19.0
 serenata-toolbox
+wand==0.4.4
+ghostscript==0.4.1
+keras==2.0.4
+tensorflow>=1.2.1
+h5py>=2.7.0
+Pillow>=4.2.1
diff --git a/rosie/chamber_of_deputies/classifiers/keras/images/images.txt b/rosie/chamber_of_deputies/classifiers/keras/images/images.txt
@@ -0,0 +1,2 @@
+This directory stores all downloaded PDF files and their converted PNG images.
+They are deleted after each prediction.
diff --git a/rosie/chamber_of_deputies/classifiers/keras/model/weights.hdf5 b/rosie/chamber_of_deputies/classifiers/keras/model/weights.hdf5
diff --git a/rosie/chamber_of_deputies/classifiers/reimbursement_generalization.py b/rosie/chamber_of_deputies/classifiers/reimbursement_generalization.py
@@ -0,0 +1,210 @@
+import os
+import unicodedata
+import numpy as np
+import pandas as pd
+from urllib.request import urlopen
+from sklearn.base import TransformerMixin
+from keras.preprocessing.image import ImageDataGenerator
+from keras.models import Sequential
+from keras.layers import Conv2D, MaxPooling2D
+from keras.layers import Activation, Dropout, Flatten, Dense
+from keras import backend as K
+from keras.callbacks import ModelCheckpoint
+from keras.models import load_model
+from keras.preprocessing.image import img_to_array
+from wand.image import Image
+from PIL import Image as pil_image
+from io import BytesIO
+
+class MealGeneralizationClassifier(TransformerMixin):
+    """
+    Meal Generalization Classifier.
+
+    Dataset
+    -------
+    applicant_id : string column
+        A personal identifier code for every person making expenses.
+
+    category : category column
+        Category of the expense. The model will be applied just in rows where
+        the value is equal to "Meal".
+
+    document_id : string column
+        The identifier of the expense.
+
+    year : string column
+        The year the expense was generated.
+    """
+
+    COLS = ['applicant_id',
+            'document_id',
+            'category',
+            'year']
+
+
+    img_width, img_height = 300, 300
+
+    def train(self,train_data_dir,validation_data_dir,save_dir):  # builds and trains the CNN; returns nothing, the best model is written to save_dir + "weights.hdf5"
+        # Fix NumPy's random seed for reproducibility.
+        seed = 2017
+        np.random.seed(seed)
+
+        nb_train_samples = sum([len(files) for r, d, files in os.walk(train_data_dir)])
+        nb_validation_samples = sum([len(files) for r, d, files in os.walk(validation_data_dir)])
+
+        print('no. of trained samples = ', nb_train_samples, ' no. of validation samples= ',nb_validation_samples)
+
+        # Dimensions the images are resized to (same values as the class-level img_width/img_height).
+        img_width, img_height = 300, 300
+
+        epochs = 20
+        batch_size = 15
+
+        if K.image_data_format() == 'channels_first':
+            input_shape = (3, img_width, img_height)
+        else:
+            input_shape = (img_width, img_height, 3)
+
+        model = Sequential()
+        model.add(Conv2D(32, (3, 3), input_shape=input_shape))
+        model.add(Activation('relu'))
+        model.add(MaxPooling2D(pool_size=(2, 2)))
+
+        model.add(Conv2D(32, (3, 3)))
+        model.add(Activation('relu'))
+        model.add(MaxPooling2D(pool_size=(2, 2)))
+
+        model.add(Conv2D(64, (3, 3)))
+        model.add(Activation('relu'))
+        model.add(MaxPooling2D(pool_size=(2, 2)))
+
+        model.add(Flatten())
+        model.add(Dense(64))
+        model.add(Activation('relu'))
+        model.add(Dropout(0.5))
+        model.add(Dense(1))
+        model.add(Activation('sigmoid'))
+
+        model.compile(loss='binary_crossentropy',
+                      optimizer='rmsprop',
+                      metrics=['accuracy'])
+
+        # Augmentation configuration used for the training images.
+        train_datagen = ImageDataGenerator(
+            rescale=1. / 255,
+            shear_range=0.2,
+            zoom_range=0.2,
+            horizontal_flip=False)  # Kept False on purpose, although the example this is based on uses True:
+        # text is never written mirrored, so flipped images would not be realistic samples.
+
+        # Configuration used for the validation images:
+        # only rescaling, no augmentation.
+        test_datagen = ImageDataGenerator(rescale=1. / 255)
+
+        train_generator = train_datagen.flow_from_directory(
+            train_data_dir,
+            target_size=(img_width, img_height),
+            batch_size=batch_size,
+            class_mode='binary')
+
+        validation_generator = test_datagen.flow_from_directory(
+            validation_data_dir,
+            target_size=(img_width, img_height),
+            batch_size=batch_size,
+            class_mode='binary')
+
+        # Save only the best model across epochs; save_dir is concatenated as-is, so it must end with a path separator.
+        checkpointer = ModelCheckpoint(filepath=save_dir+"weights.hdf5", verbose=1, save_best_only=True)
+
+        model.fit_generator(
+            train_generator,
+             callbacks=[checkpointer],  # checkpoint callback that saves only the best model
+            steps_per_epoch=nb_train_samples // batch_size,
+            epochs=epochs,
+            validation_data=validation_generator,
+            validation_steps=nb_validation_samples // batch_size)
+
+    def fit(self, X):
+        # Load an existing Keras model; X is handed directly to load_model (the callers shown in this patch pass a file path).
+        self.keras_model = load_model(X)
+        return self
+
+    def transform(self, X=None):
+        pass  # no-op: nothing is transformed and None is returned
+
+    def predict(self, X):  # returns the boolean 'y' column, restricted to the rows whose category is 'Meal'
+        self._X = X[self.COLS]
+        self._X = self._X[self.__applicable_rows(self._X)]
+        self._X = self.__document_url(self._X)
+        self._X['y']=False  # placeholder; replaced by the collected results below
+        result=[]
+
+        for index, item in self._X.iterrows():
+
+            png_image = self.__download_doc(item.link)
+            if png_image is not None :
+                x = img_to_array(png_image)  # NOTE(review): pixels are not rescaled here, while train() uses rescale=1./255 - confirm this is intended
+                x = np.expand_dims(x, axis=0)
+
+                preds = self.keras_model.predict_classes(x, verbose=0)  # predicted class for the image
+                prob = self.keras_model.predict_proba(x, verbose=0)  # predicted probability for the image
+                if(prob>=0.8 and preds==1):  # flag only when the predicted probability is at least 0.8 and the class is 1 (suspicious)
+                    result.append(True)
+                else:
+                    result.append(False)
+            else:
+                result.append(False)  # document could not be downloaded or converted: treated as not suspicious
+
+        self._X['y']=result
+        return self._X['y']
+
+    def __applicable_rows(self, X):
+        return (X['category'] == 'Meal')  # element-wise comparison: selects the rows whose category is exactly 'Meal'
+
+
+    """Build the URL of each document on the Chamber of Deputies web site.
+        Returns the given dataframe with a new `link` column holding the URL of each row's PDF file.
+
+        arguments:
+        X -- dataframe with the applicant_id, year and document_id columns
+    """
+
+    def __document_url(self,X):
+        X['link']=''  # placeholder; overwritten with the real URLs below
+        links=list()
+        for index, x in X.iterrows():
+            links.append('http://www.camara.gov.br/cota-parlamentar/documentos/publ/{}/{}/{}.pdf'.format(x.applicant_id,x.year, x.document_id))
+        X['link']=links
+        return X  # the same object that was passed in, now carrying the link column
+
+    """Download a PDF file and convert it to a PNG image.
+        Returns the image as an RGB PIL image resized to the class-level img_height/img_width.
+
+        arguments:
+        url_link -- URL of the PDF on the Chamber of Deputies web site, e.g., http://www.../documentos/publ/2437/2015/5645177.pdf
+
+        On any exception the error is printed and None is returned.
+    """
+    def __download_doc(self,url_link):
+            # Everything is handled in memory; no file is written to disk.
+            try:
+                # open the request and get the file
+                response = urlopen(url_link)
+                print(response)
+                # read the file with resolution=300 to get a good-quality rendering
+                with Image(file=response, resolution=300) as img:
+                    img.compression_quality = 99
+                    # PNG is the format chosen for the converted image
+                    with img.convert('png') as converted:
+                        print(converted)
+                        data = pil_image.open(BytesIO(converted.make_blob()))
+                        data = data.convert('RGB')
+                        hw_tuple = (self.img_height, self.img_width)  # NOTE(review): PIL sizes are (width, height); harmless while both are 300
+                        if data.size != hw_tuple:
+                            data = data.resize(hw_tuple)
+                        print(data)
+                        return data
+            except Exception as ex:
+                print("Error during pdf download")
+                print(ex)
+                return None  # any exception results in None
diff --git a/rosie/chamber_of_deputies/settings.py b/rosie/chamber_of_deputies/settings.py
@@ -3,9 +3,12 @@
 from rosie.chamber_of_deputies.classifiers.meal_price_outlier_classifier import MealPriceOutlierClassifier
 from rosie.chamber_of_deputies.classifiers.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier
 from rosie.chamber_of_deputies.classifiers.traveled_speeds_classifier import TraveledSpeedsClassifier
+from rosie.chamber_of_deputies.classifiers.reimbursement_generalization import MealGeneralizationClassifier
 from rosie.core.classifiers.invalid_cnpj_cpf_classifier import InvalidCnpjCpfClassifier
 
+
 CLASSIFIERS = {
+    'reimbursement_generalization': MealGeneralizationClassifier,
     'meal_price_outlier': MealPriceOutlierClassifier,
     'over_monthly_subquota_limit': MonthlySubquotaLimitClassifier,
     'suspicious_traveled_speed_day': TraveledSpeedsClassifier,

diff --git a/rosie/chamber_of_deputies/tests/fixtures/generalization_reimbursements.csv b/rosie/chamber_of_deputies/tests/fixtures/generalization_reimbursements.csv
@@ -0,0 +1,9 @@
+document_id,applicant_id,year,category
+5787458,2277,2015,Meal
+5856593,3072,2015,Meal
+5739133,2987,2015,Meal
+5791826,3010,2015,Meal
+5797797,3082,2015,Meal
+5777103,1467,2015,Meal
+5630857,1467,2015,Flight ticket issue
+5630627,1467,2015,Flight ticket issue
diff --git a/rosie/chamber_of_deputies/tests/test_reimbursement_generalization.py b/rosie/chamber_of_deputies/tests/test_reimbursement_generalization.py
@@ -0,0 +1,28 @@
+from unittest import TestCase
+
+import numpy as np
+import pandas as pd
+from numpy.testing import assert_array_equal
+
+from rosie.chamber_of_deputies.classifiers.reimbursement_generalization import MealGeneralizationClassifier
+
+
+class TestMealGeneralizationClassifier(TestCase):
+
+    def setUp(self):  # loads the committed weights, reads the fixture CSV and runs predict() before every test
+        self.model = 'rosie/chamber_of_deputies/classifiers/keras/model/weights.hdf5'
+        self.dataset = pd.read_csv('rosie/chamber_of_deputies/tests/fixtures/generalization_reimbursements.csv',
+                                   dtype={'document_id': np.str,
+                          'applicant_id': np.str,
+                          'year': np.str})
+        self.subject = MealGeneralizationClassifier()
+        self.subject.fit(self.model)
+        self.prediction = self.subject.predict(self.dataset)  # NOTE: predict() fetches every Meal document over HTTP, so these tests need network access
+
+    def test_predict_true_when_generalized(self):
+        assert_array_equal(np.repeat(True, 4),
+                           self.prediction[[0, 2, 4, 5]])  # fixture rows 0, 2, 4 and 5 are expected to be flagged
+
+    def test_predict_false_when_not_generalized(self):
+        assert_array_equal(np.repeat(False, 2),
+                          self.prediction[[1, 3]])  # fixture rows 1 and 3 are expected not to be flagged
diff --git a/rosie/core/__init__.py b/rosie/core/__init__.py
@@ -51,7 +51,9 @@ def load_trained_model(self, classifier):
         if classifier.__name__ == 'MonthlySubquotaLimitClassifier':
             model = classifier()
             model.fit(self.dataset)
-
+        elif classifier.__name__ == 'MealGeneralizationClassifier':
+            model = classifier()
+            model.fit('rosie/chamber_of_deputies/classifiers/keras/model/weights.hdf5')
         else:
             if os.path.isfile(path):
                 model = joblib.load(path)