This repository has been archived by the owner on Mar 1, 2018. It is now read-only.

First steps towards generalization in reimbursements description #66

Open: wants to merge 36 commits into master
Changes from all commits (36 commits)
d62497b
First steps towards generalization in reimbursements description
silviodc Jul 20, 2017
648bf9a
Merge branch 'master' into master
silviodc Jul 28, 2017
5b59e13
Update .travis.yml
silviodc Jul 28, 2017
f05c579
Update reimbursement_generalization.py
silviodc Jul 29, 2017
82a7395
Update reimbursement_generalization.py
silviodc Jul 29, 2017
e87c5ca
Update reimbursement_generalization.py
silviodc Jul 29, 2017
ca3a2e3
Update reimbursement_generalization.py
silviodc Jul 29, 2017
9d799dc
Update reimbursement_generalization.py
silviodc Jul 29, 2017
0c4d8f1
Update reimbursement_generalization.py
silviodc Jul 29, 2017
3c9eede
Update reimbursement_generalization.py
silviodc Jul 29, 2017
9e656d9
Update .travis.yml
silviodc Jul 29, 2017
6036199
Update .travis.yml
silviodc Jul 31, 2017
cc3a83a
Update reimbursement_generalization.py
silviodc Jul 31, 2017
487add5
Update reimbursement_generalization.py
silviodc Jul 31, 2017
2be6e50
Update reimbursement_generalization.py
silviodc Jul 31, 2017
2787189
Update reimbursement_generalization.py
silviodc Jul 31, 2017
f603520
Update reimbursement_generalization.py
silviodc Jul 31, 2017
9150535
Update reimbursement_generalization.py
silviodc Jul 31, 2017
b807779
Update reimbursement_generalization.py
silviodc Jul 31, 2017
4541b3a
Update reimbursement_generalization.py
silviodc Jul 31, 2017
e25dead
Update reimbursement_generalization.py
silviodc Aug 4, 2017
2b8a7e8
Update reimbursement_generalization.py
silviodc Aug 5, 2017
a46dfe2
Update reimbursement_generalization.py
silviodc Aug 5, 2017
24c5b91
Update reimbursement_generalization.py
silviodc Aug 5, 2017
f50112a
Update reimbursement_generalization.py
silviodc Aug 5, 2017
19bd8ae
Update .travis.yml
silviodc Aug 5, 2017
0d71c55
Update .travis.yml
silviodc Aug 5, 2017
c657d95
Update .travis.yml
silviodc Aug 5, 2017
9396ecd
Update .travis.yml
silviodc Aug 5, 2017
0300cf9
1) Review code for PEP8 requirements (using: http://pep8online.com/ch…
silviodc Aug 9, 2017
5b9952d
Merge commit '9396ecd1ab171c524827c2fd72349ac671f313ec'
silviodc Aug 9, 2017
b036f1d
Include tests for training
silviodc Aug 11, 2017
7620398
Include test to CORE
silviodc Aug 12, 2017
f919453
Changing location of supervised models to settings
silviodc Aug 12, 2017
9ce573f
Fixing integration between core and senate
silviodc Aug 12, 2017
4060063
Fixing load supervised model
silviodc Aug 12, 2017
3 changes: 3 additions & 0 deletions .travis.yml
@@ -1,7 +1,10 @@
dist: trusty
language: python
python: 3.6
cache: pip
install:
- "travis_retry sudo apt-get update"
- "travis_retry sudo apt-get -qq install libfreetype6-dev liblcms2-dev python-qt4 ghostscript libffi-dev libjpeg-turbo-progs cmake imagemagick"
- ./setup
- pip install coveralls
script:
7 changes: 3 additions & 4 deletions Dockerfile
@@ -1,17 +1,16 @@
FROM python:3.5

USER root

RUN apt-get update && apt-get install -y \
build-essential \
libxml2-dev \
libxslt1-dev \
python3-dev \
unzip \
libmagickwand-dev \
ghostscript \
zlib1g-dev

RUN pip install --upgrade pip

RUN pip install --upgrade pip
COPY requirements.txt ./
COPY setup ./
COPY rosie.py ./
6 changes: 6 additions & 0 deletions requirements.txt
@@ -3,3 +3,9 @@
pycpfcnpj==1.0.2
scikit-learn==0.18.1
scipy==0.19.0
serenata-toolbox
wand==0.4.4
ghostscript==0.4.1
keras==2.0.4
tensorflow>=1.2.1
h5py>=2.7.0
Pillow>=4.2.1
2 changes: 1 addition & 1 deletion rosie/chamber_of_deputies/adapter.py
@@ -16,7 +16,7 @@

class Adapter:
    COMPANIES_DATASET = '2016-09-03-companies.xz'

    def __init__(self, path):
        self.path = path

238 changes: 238 additions & 0 deletions rosie/chamber_of_deputies/classifiers/reimbursement_generalization.py
@@ -0,0 +1,238 @@
import os
import unicodedata
import shutil
from io import BytesIO
from urllib.request import urlopen

import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.layers import (Activation, Conv2D, Dense, Dropout, Flatten,
                          MaxPooling2D)
from keras.models import Sequential, load_model
from keras.preprocessing.image import ImageDataGenerator, img_to_array
from PIL import Image as pil_image
from sklearn.base import TransformerMixin
from wand.image import Image


class MealGeneralizationClassifier(TransformerMixin):
    """
    Meal Generalization Classifier.

    Dataset
    -------
    applicant_id : string column
        A personal identifier code for every person making expenses.

    category : category column
        Category of the expense. The model will be applied only to rows
        where the value is equal to "Meal".

    document_id : string column
        The identifier of the expense.

    year : string column
        The year the expense was generated.
    """

    COLUMNS = ['applicant_id', 'document_id', 'category', 'year']

    # Dimensions of our images
    img_width, img_height = 300, 300

    # Defines how many iterations will run to find the best model during training
    epochs = 20
    # Influences the speed of the learning (execution)
    batch_size = 15

    def train(self, train_data_dir, validation_data_dir, save_dir):
        # Fix random seed for reproducibility
        seed = 2017
        np.random.seed(seed)

        nb_train_samples = sum([len(files) for r, d, files in os.walk(train_data_dir)])
        nb_validation_samples = sum([len(files) for r, d, files in os.walk(validation_data_dir)])

        print('no. of trained samples = ', nb_train_samples,
              ' no. of validation samples= ', nb_validation_samples)

        if K.image_data_format() == 'channels_first':
            input_shape = (3, self.img_width, self.img_height)
        else:
            input_shape = (self.img_width, self.img_height, 3)

        model = Sequential()
        # It's a stack of 3 convolution layers with ReLU activation followed by
        # max-pooling layers. This is very similar to the architectures that
        # Yann LeCun advocated in the 1990s for image classification
        # (with the exception of ReLU).
        model.add(Conv2D(32, (3, 3), input_shape=input_shape))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

        model.add(Conv2D(32, (3, 3)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

        model.add(Conv2D(64, (3, 3)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
Collaborator

I noticed that you run the same process here three times with very little difference between the parameters used. Could you explain a little why that is necessary?

Author

I included a brief description and a link to explain it: http://deeplearning.net/tutorial/lenet.html
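
To make the reply concrete: each Conv2D + MaxPooling2D block halves the feature maps while letting later filters see a wider region of the receipt image. A minimal sketch, assuming Keras 2.x and the same layer parameters as this PR (the shape comments are the point):

# Sketch: how the three Conv2D/MaxPooling2D blocks shrink a 300x300 RGB input.
from keras.layers import Activation, Conv2D, MaxPooling2D
from keras.models import Sequential

model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=(300, 300, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))  # 298x298 -> 149x149

model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))  # 147x147 -> 73x73

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))  # 71x71 -> 35x35

model.summary()  # prints the shrinking output shape of each layer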


        model.add(Flatten())
        model.add(Dense(64))
        model.add(Activation('relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1))
        model.add(Activation('sigmoid'))

        # A convolutional network is a specific artificial neural network topology
        # inspired by the biological visual cortex and tailored for computer vision tasks.
        # Author: Yann LeCun, early 1990s.
        # See http://deeplearning.net/tutorial/lenet.html for an introduction,
        # or this simplified version: https://www.youtube.com/watch?v=JiN9p5vWHDY

        model.compile(loss='binary_crossentropy',
                      optimizer='rmsprop',
                      metrics=['accuracy'])

        # This is the augmentation configuration we will use for training.
        # horizontal_flip is False because Portuguese is not handwritten
        # from right to left.
        train_datagen = ImageDataGenerator(
            rescale=1. / 255,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=False)

        # This is the augmentation configuration we will use for validation
        test_datagen = ImageDataGenerator(rescale=1. / 255)

        train_generator = train_datagen.flow_from_directory(
            train_data_dir,
            target_size=(self.img_width, self.img_height),
            batch_size=self.batch_size,
            class_mode='binary')

        # Generates more images for the validation step
        validation_generator = test_datagen.flow_from_directory(
            validation_data_dir,
            target_size=(self.img_width, self.img_height),
            batch_size=self.batch_size,
            class_mode='binary')

        # It allows us to save only the best model among the iterations
        checkpointer = ModelCheckpoint(
            filepath=os.path.join(save_dir, "weights.hdf5"),
            verbose=1, save_best_only=True)

        # We pass it as a callback to save only the best model
        model.fit_generator(
            train_generator,
            callbacks=[checkpointer],
            steps_per_epoch=nb_train_samples // self.batch_size,
            epochs=self.epochs,
            validation_data=validation_generator,
            validation_steps=nb_validation_samples // self.batch_size)

    def fit(self, X):
        # Load an existing Keras model, downloading it first if X is a URL
        if not os.path.isfile(X) and isinstance(X, str) and ('https' in X or 'http' in X):
            response = urlopen(X)
            with open('weights.hdf5', 'wb') as fp:
                shutil.copyfileobj(response, fp)
            self.keras_model = load_model(fp.name)
            os.unlink(fp.name)
        else:
            self.keras_model = load_model(X)

        return self

    def transform(self, X=None):
        pass

    def predict(self, X):
        # Only use the relevant columns for our classifier
        self._X = X[self.COLUMNS]
        # Remove the reimbursements from categories other than Meal
        self._X = self._X[self.__applicable_rows(self._X)]
        # Create a link to the Chamber of Deputies document
        self._X = self.__document_url(self._X)
        # Assume nothing is suspicious
        self._X['y'] = False
        result = []

        for index, item in self._X.iterrows():
            # Download the reimbursement
            png_image = self.download_doc(item.link)
            if png_image is not None:
                x = img_to_array(png_image)
                x = np.expand_dims(x, axis=0)
                # Predict it with our model :D
                preds = self.keras_model.predict_classes(x, verbose=0)
                # Get the probability of the prediction
                prob = self.keras_model.predict_proba(x, verbose=0)
                # Keep only confident positive predictions (suspicious)
                if prob >= 0.8 and preds == 1:
                    result.append(True)
                else:
                    result.append(False)
            else:
                # In case the reimbursement could not be converted to png
                result.append(False)

        self._X['y'] = result
        return self._X['y']

    def __applicable_rows(self, X):
        return X['category'] == 'Meal'

    """Creates a new column 'link' containing a URL
    for the files in the Chamber of Deputies website.
    Returns the updated Dataframe

    arguments:
    record -- Dataframe
    """

    def __document_url(self, X):
        X['link'] = ''
        links = list()
        for index, x in X.iterrows():
            base = "http://www.camara.gov.br/cota-parlamentar/documentos/publ"
            url = '{}/{}/{}/{}.pdf'.format(base, x.applicant_id, x.year, x.document_id)
            links.append(url)
        X['link'] = links
        return X

    """Downloads a pdf file and converts it to png.
    Returns the png image as a PIL image, which is necessary for the Keras API

    arguments:
    url -- the url to the Chamber of Deputies website, e.g.,
    http://www.../documentos/publ/2437/2015/5645177.pdf

    Exception -- returns None
    """

    def download_doc(self, url_link):
        try:
            # Open the request and get the file
            response = urlopen(url_link)
            # Default arguments to read the file with a good resolution
            with Image(file=response, resolution=300) as img:
                img.compression_quality = 99
                # Chosen format to convert pdf to image
                with img.convert('png') as converted:
                    # Converts the Wand image to a PIL image
                    data = pil_image.open(BytesIO(converted.make_blob()))
                    data = data.convert('RGB')
                    hw_tuple = (self.img_height, self.img_width)
                    # Resize the PIL image to fit our ML model
                    if data.size != hw_tuple:
                        data = data.resize(hw_tuple)
                    return data
        except Exception as ex:
            print("Error during pdf download")
            print(ex)
            # In case we get an exception, return None
            return None
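
For reviewers who want to exercise the new classifier end to end, a minimal smoke-test sketch. The weights path and the sample rows are illustrative (the rows mirror the test fixture at the end of this PR); only the class and column names come from the diff above. Note that predict downloads the PDFs from camara.gov.br, so it needs network access:

# Hypothetical smoke test for MealGeneralizationClassifier.
import pandas as pd
from rosie.chamber_of_deputies.classifiers.reimbursement_generalization import \
    MealGeneralizationClassifier

sample = pd.DataFrame([
    {'applicant_id': '2277', 'document_id': '5787458', 'year': '2015', 'category': 'Meal'},
    {'applicant_id': '1467', 'document_id': '5630857', 'year': '2015', 'category': 'Flight ticket issue'},
])

model = MealGeneralizationClassifier().fit('weights.hdf5')  # local file or http(s) URL
suspicions = model.predict(sample)  # boolean Series; non-Meal rows are filtered out
print(suspicions)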
6 changes: 6 additions & 0 deletions rosie/chamber_of_deputies/settings.py
@@ -3,9 +3,12 @@
from rosie.chamber_of_deputies.classifiers.meal_price_outlier_classifier import MealPriceOutlierClassifier
from rosie.chamber_of_deputies.classifiers.monthly_subquota_limit_classifier import MonthlySubquotaLimitClassifier
from rosie.chamber_of_deputies.classifiers.traveled_speeds_classifier import TraveledSpeedsClassifier
from rosie.chamber_of_deputies.classifiers.reimbursement_generalization import MealGeneralizationClassifier
from rosie.core.classifiers.invalid_cnpj_cpf_classifier import InvalidCnpjCpfClassifier


CLASSIFIERS = {
    'reimbursement_generalization': MealGeneralizationClassifier,
    'meal_price_outlier': MealPriceOutlierClassifier,
    'over_monthly_subquota_limit': MonthlySubquotaLimitClassifier,
    'suspicious_traveled_speed_day': TraveledSpeedsClassifier,
@@ -15,3 +18,6 @@
}

UNIQUE_IDS = ['applicant_id', 'year', 'document_id']

SUPERVISED_MODEL = {'MealGeneralizationClassifier':
                    'https://drive.google.com/uc?export=download&id=0B6F2XOmMAf28dUFmUU92MWpxMFU'}
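
Since the trained weights now live at a URL in settings, loading presumably happens along these lines (a sketch, the core wiring is not shown in this diff; fit accepts a local path or an http(s) URL and downloads weights.hdf5 in the latter case):

# Sketch: resolving the supervised model URL from settings and loading it.
from rosie.chamber_of_deputies import settings
from rosie.chamber_of_deputies.classifiers.reimbursement_generalization import \
    MealGeneralizationClassifier

url = settings.SUPERVISED_MODEL['MealGeneralizationClassifier']
classifier = MealGeneralizationClassifier().fit(url)  # downloads and loads weights.hdf5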
@@ -0,0 +1,9 @@
document_id,applicant_id,year,category
5787458,2277,2015,Meal
5856593,3072,2015,Meal
5739133,2987,2015,Meal
5791826,3010,2015,Meal
5797797,3082,2015,Meal
5777103,1467,2015,Meal
5630857,1467,2015,Flight ticket issue
5630627,1467,2015,Flight ticket issue