-
Notifications
You must be signed in to change notification settings - Fork 60
First steps towards generalization in reimbursements description #66
base: master
Are you sure you want to change the base?
Changes from 29 commits
d62497b
648bf9a
5b59e13
f05c579
82a7395
e87c5ca
ca3a2e3
9d799dc
0c4d8f1
3c9eede
9e656d9
6036199
cc3a83a
487add5
2be6e50
2787189
f603520
9150535
b807779
4541b3a
e25dead
2b8a7e8
a46dfe2
24c5b91
f50112a
19bd8ae
0d71c55
c657d95
9396ecd
0300cf9
5b9952d
b036f1d
7620398
f919453
9ce573f
4060063
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
This repository will store all PDF and PNG files. | ||
They are deleted after each prediction. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
import os | ||
import unicodedata | ||
import numpy as np | ||
import pandas as pd | ||
from urllib.request import urlopen | ||
from sklearn.base import TransformerMixin | ||
from keras.preprocessing.image import ImageDataGenerator | ||
from keras.models import Sequential | ||
from keras.layers import Conv2D, MaxPooling2D | ||
from keras.layers import Activation, Dropout, Flatten, Dense | ||
from keras import backend as K | ||
from keras.callbacks import ModelCheckpoint | ||
from keras.models import load_model | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can follow PEP-8 imports section to guide you when organizing your imports. For example, when doing Note that you should also group your imports:
Also on that note, you can make use of tools to help you automatically organize your imports like isort. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! Now they look beautiful :D |
||
from keras.preprocessing.image import img_to_array | ||
from wand.image import Image | ||
from PIL import Image as pil_image | ||
from io import BytesIO | ||
|
||
class MealGeneralizationClassifier(TransformerMixin): | ||
""" | ||
Meal Generalization Classifier. | ||
|
||
Dataset | ||
------- | ||
applicant_id : string column | ||
A personal identifier code for every person making expenses. | ||
|
||
category : category column | ||
Category of the expense. The model will be applied just in rows where | ||
the value is equal to "Meal". | ||
|
||
document_id : string column | ||
The identifier of the expense. | ||
|
||
year : string column | ||
The year the expense was generated. | ||
""" | ||
|
||
COLS = ['applicant_id', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't be afraid to use descriptive names like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In fact i tried to follow the code of other classifiers. Ctrl+C > Ctrl+V |
||
'document_id', | ||
'category', | ||
'year'] | ||
|
||
|
||
img_width, img_height = 300, 300 | ||
|
||
def train(self,train_data_dir,validation_data_dir,save_dir): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pay extra attention to spaces after commas, they help make your code easier on the eyes 😉 This method would be nicer like this:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I changed the code using this tool: http://pep8online.com/checkresult |
||
#fix random seed for reproducibility | ||
seed = 2017 | ||
np.random.seed(seed) | ||
|
||
nb_train_samples = sum([len(files) for r, d, files in os.walk(train_data_dir)]) | ||
nb_validation_samples = sum([len(files) for r, d, files in os.walk(validation_data_dir)]) | ||
|
||
print('no. of trained samples = ', nb_train_samples, ' no. of validation samples= ',nb_validation_samples) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Long lines aren't a good thing. This print could be like the following in order to avoid the extra-long line:
Extra space was missing there. |
||
|
||
#dimensions of our images. | ||
img_width, img_height = 300, 300 | ||
|
||
epochs = 20 | ||
batch_size = 15 | ||
|
||
if K.image_data_format() == 'channels_first': | ||
input_shape = (3, img_width, img_height) | ||
else: | ||
input_shape = (img_width, img_height, 3) | ||
|
||
model = Sequential() | ||
model.add(Conv2D(32, (3, 3), input_shape=input_shape)) | ||
model.add(Activation('relu')) | ||
model.add(MaxPooling2D(pool_size=(2, 2))) | ||
|
||
model.add(Conv2D(32, (3, 3))) | ||
model.add(Activation('relu')) | ||
model.add(MaxPooling2D(pool_size=(2, 2))) | ||
|
||
model.add(Conv2D(64, (3, 3))) | ||
model.add(Activation('relu')) | ||
model.add(MaxPooling2D(pool_size=(2, 2))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I noted that you make the same process here 3 times with very little difference between parameters used. Maybe you can explain a little why is that necessary? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I included a brief description and a link to explain it: http://deeplearning.net/tutorial/lenet.html |
||
|
||
model.add(Flatten()) | ||
model.add(Dense(64)) | ||
model.add(Activation('relu')) | ||
model.add(Dropout(0.5)) | ||
model.add(Dense(1)) | ||
model.add(Activation('sigmoid')) | ||
|
||
model.compile(loss='binary_crossentropy', | ||
optimizer='rmsprop', | ||
metrics=['accuracy']) | ||
|
||
#this is the augmentation configuration we will use for training | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PEP-8 inline comments state that:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, using the tool |
||
train_datagen = ImageDataGenerator( | ||
rescale=1. / 255, | ||
shear_range=0.2, | ||
zoom_range=0.2, | ||
horizontal_flip=False)#As you can see i put it as FALSE and on link example it is TRUE | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PEP-8 inline comments state that:
And be careful on the line length here too. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, I didn't quite get what you meant here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It was a copy and paste from serenata. Now I have included the reason: I set horizontal_flip to FALSE because Portuguese is not handwritten from right to left. |
||
#Explanation, there no possibility to write in a reverse way :P | ||
|
||
#this is the augmentation configuration we will use for testing: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same thing as inline comments, these should start with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is applicable to all other comments you made in this file ;) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PEP8 requirements done, using it: http://pep8online.com/checkresult |
||
#only rescaling | ||
test_datagen = ImageDataGenerator(rescale=1. / 255) | ||
|
||
train_generator = train_datagen.flow_from_directory( | ||
train_data_dir, | ||
target_size=(img_width, img_height), | ||
batch_size=batch_size, | ||
class_mode='binary') | ||
|
||
validation_generator = test_datagen.flow_from_directory( | ||
validation_data_dir, | ||
target_size=(img_width, img_height), | ||
batch_size=batch_size, | ||
class_mode='binary') | ||
|
||
#It allow us to save only the best model between the iterations | ||
checkpointer = ModelCheckpoint(filepath=save_dir+"weights.hdf5", verbose=1, save_best_only=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. File paths should be built using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jtemporal Another good option with good API is pathlib There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, i included the os.path.join |
||
|
||
model.fit_generator( | ||
train_generator, | ||
callbacks=[checkpointer], #And we set the parameter to save only the best model | ||
steps_per_epoch=nb_train_samples // batch_size, | ||
epochs=epochs, | ||
validation_data=validation_generator, | ||
validation_steps=nb_validation_samples // batch_size) | ||
|
||
def fit(self, X):
    """Load an existing Keras model and keep it on the instance.

    Arguments:
    X -- value handed unchanged to ``load_model``

    Returns the classifier itself; the loaded model is stored in
    ``self.keras_model``.
    """
    trained_model = load_model(X)
    self.keras_model = trained_model
    return self
|
||
def transform(self, X=None):
    """Do nothing: this classifier has no transformation step."""
    return None
|
||
def predict(self, X):
    """Flag the meal reimbursements the model considers suspicious.

    Only rows whose ``category`` equals "Meal" are evaluated. For each of
    them the document is downloaded, converted to an image and passed to
    the Keras model loaded by ``fit``.

    Arguments:
    X -- DataFrame holding at least the columns listed in ``COLS``

    Returns a boolean Series named ``y`` indexed like the applicable rows:
    True when the model output is at least 0.8, False otherwise (also when
    the document could not be downloaded or converted).
    """
    # Work on an explicit copy: adding columns to a filtered slice is
    # chained assignment and makes pandas raise SettingWithCopyWarning.
    self._X = X.loc[self.__applicable_rows(X), self.COLS].copy()
    self._X = self.__document_url(self._X)

    result = []
    for url in self._X['link']:
        png_image = self.__download_doc(url)
        if png_image is None:
            result.append(False)
            continue

        x = np.expand_dims(img_to_array(png_image), axis=0)
        # NOTE(review): train() feeds the network images rescaled by 1/255,
        # while this array is passed without rescaling -- confirm intended.

        # One forward pass is enough. The network ends in a single sigmoid
        # unit, so an output >= 0.8 already implies class 1; the separate
        # predict_classes / predict_proba calls (both removed from later
        # Keras releases) are not needed.
        prob = float(np.ravel(self.keras_model.predict(x, verbose=0))[0])
        result.append(prob >= 0.8)

    self._X['y'] = pd.Series(result, index=self._X.index, dtype=bool)
    return self._X['y']
|
||
def __applicable_rows(self, X):
    """Return a boolean mask selecting the rows whose category is "Meal"."""
    is_meal = X['category'] == 'Meal'
    return is_meal
|
||
|
def __document_url(self, X):
    """Add a ``link`` column with the URL of each reimbursement document.

    Each URL points to the PDF on the Chamber of Deputies web site and is
    built from the ``applicant_id``, ``year`` and ``document_id`` values
    of the row.

    Arguments:
    X -- DataFrame with the columns ``applicant_id``, ``year`` and
         ``document_id``; the ``link`` column is added to it in place

    Returns the same DataFrame with the ``link`` column filled in.
    """
    url_template = ('http://www.camara.gov.br/cota-parlamentar/'
                    'documentos/publ/{}/{}/{}.pdf')
    # Walk the three columns in lockstep instead of building a Series per
    # row with iterrows().
    X['link'] = [
        url_template.format(applicant_id, year, document_id)
        for applicant_id, year, document_id
        in zip(X['applicant_id'], X['year'], X['document_id'])
    ]
    return X
|
def __download_doc(self, url_link):
    """Download a PDF file and convert it to a PNG image.

    Arguments:
    url_link -- the PDF URL on the Chamber of Deputies web site, e.g.,
                http://www.../documentos/publ/2437/2015/5645177.pdf

    Returns a PIL image in RGB mode, resized to
    ``(self.img_height, self.img_width)`` when its size differs, or None
    when the download or the conversion raises an exception.
    """
    try:
        # The context manager closes the HTTP response once the PDF has
        # been read, even if the conversion fails.
        with urlopen(url_link) as response:
            # Read the PDF at 300 dpi to get a good resolution
            with Image(file=response, resolution=300) as img:
                img.compression_quality = 99
                # PNG is the format chosen for the converted document
                with img.convert('png') as converted:
                    png_blob = converted.make_blob()

        data = pil_image.open(BytesIO(png_blob)).convert('RGB')
        hw_tuple = (self.img_height, self.img_width)
        if data.size != hw_tuple:
            data = data.resize(hw_tuple)
        return data
    except Exception as ex:
        # Deliberately best effort: a document that cannot be fetched or
        # converted is reported and skipped instead of aborting the run.
        print("Error during pdf download")
        print(ex)
        return None
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
document_id,applicant_id,year,category | ||
5787458,2277,2015,Meal | ||
5856593,3072,2015,Meal | ||
5739133,2987,2015,Meal | ||
5791826,3010,2015,Meal | ||
5797797,3082,2015,Meal | ||
5777103,1467,2015,Meal | ||
5630857,1467,2015,Flight ticket issue | ||
5630627,1467,2015,Flight ticket issue |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from unittest import TestCase | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from numpy.testing import assert_array_equal | ||
|
||
from rosie.chamber_of_deputies.classifiers.reimbursement_generalization import MealGeneralizationClassifier | ||
|
||
|
||
class TestMealGeneralizationClassifier(TestCase):
    """Run the classifier over the fixture rows and check its predictions."""

    @classmethod
    def setUpClass(cls):
        # Loading the model and predicting (which downloads every document)
        # is done once for the whole class instead of once per test.
        cls.model = 'rosie/chamber_of_deputies/classifiers/keras/model/weights.hdf5'
        cls.dataset = pd.read_csv(
            'rosie/chamber_of_deputies/tests/fixtures/generalization_reimbursements.csv',
            # Built-in ``str`` replaces the ``np.str`` alias, which NumPy
            # deprecated in 1.20 and removed in 1.24.
            dtype={'document_id': str,
                   'applicant_id': str,
                   'year': str})
        cls.subject = MealGeneralizationClassifier()
        cls.subject.fit(cls.model)
        cls.prediction = cls.subject.predict(cls.dataset)

    def test_predict_true_when_generalized(self):
        assert_array_equal(np.repeat(True, 4),
                           self.prediction[[0, 2, 4, 5]])

    def test_predict_false_when_not_generalized(self):
        assert_array_equal(np.repeat(False, 2),
                           self.prediction[[1, 3]])
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't need to be here since Rosie's
setup
file creates config.ini
from the example file copied in the previous line 😉 You can go ahead and delete this line.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, I removed it. Strangely, the first time I tried to build the image I got an error without it... Anyway, now it's OK.