Commit

Classifier to detect generalization in the description of reimbursements
silviodc committed May 27, 2017
1 parent 58d9faa commit 29bdae0
Showing 8 changed files with 14,840 additions and 867 deletions.
280 changes: 1 addition & 279 deletions develop/2017-05-05-silvio-PDF-to-PNG-SIFT-descriptors.html

Large diffs are not rendered by default.

231 changes: 1 addition & 230 deletions develop/2017-05-05-silvio-PDF-to-PNG-SIFT-descriptors.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# This python notebook only shows how to converte PDF files to PNG\n",
"# This python notebook shows how to converte PDF files to PNG and use SIFT\n",
"\n",
"## To do so, 1) install the following requiriments:\n",
"\n",
@@ -405,235 +405,6 @@
"### About these similar images check the issue: https://github.com/datasciencebr/serenata-de-amor/issues/32\n",
"##### ---- So, i also will try to combine the sift descriptors and other information. \n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import glob\n",
"from __future__ import print_function\n",
"from wand.image import Image\n",
"\n",
"def convert_pdf_png_and_save(file_name,new_file_name):\n",
" \"\"\"Convert a pdf file to png and save it at disk\n",
"\n",
" arguments:\n",
" file_name -- the real path to access the pdf file on disk\n",
" new_file_name -- my_path/12312.png\n",
" \"\"\"\n",
" #Default arguments to read the file and has a good resolution\n",
" with Image(filename=file_name, resolution=300) as img:\n",
" img.compression_quality = 99\n",
" print('width =', img.width)\n",
" print('height =', img.height)\n",
" print('pages = ', len(img.sequence))\n",
" print('resolution = ', img.resolution)\n",
"\n",
" #Format choosed to convert the pdf to image\n",
" with img.convert('png') as converted:\n",
" converted.save(filename=new_file_name)\n",
" \n",
"\n",
"pdf_directory=\"../data/training set/not_wrong/\"\n",
" \n",
"#Get the pdfs files downloaded in our folder, e.g., /data\n",
"pdfs = glob.glob(pdf_directory+'*.pdf')\n",
"\n",
"#Case we have pdf files we convert the pdf_files_name to doc_ids\n",
"for file in pdfs:\n",
" full_name= file.split(\"/\")\n",
" file_name = full_name[len(full_name)-1]\n",
" file_name= file_name.split(\".pdf\")\n",
" file_name= pdf_directory+file_name[0]+\".png\"\n",
" convert_pdf_png_and_save(file,file_name)\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenCV VERSION (should be 3.1.0 or later, with nonfree modules installed!): 3.2.0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
" \"This module will be removed in 0.20.\", DeprecationWarning)\n",
"/opt/conda/lib/python3.5/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n",
" DeprecationWarning)\n"
]
}
],
"source": [
"import visual_bow as bow\n",
"import cv2\n",
"import numpy as np\n",
"import glob\n",
"import os\n",
"from sklearn.svm import SVC\n",
"from sklearn.grid_search import GridSearchCV"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def binary_labeled_img_from_folder(positive_folder, cal101_root='../data/training set/ML/', image_suffix='*.png'):\n",
" \"\"\"\n",
" Generate a balanced dataset of positive and negative images from a directory of images\n",
" where each type of image is separated in its own folder.\n",
"\n",
" Returns:\n",
" --------\n",
" labeled_img_paths: list of lists\n",
" Of the form [[image_path, label], ...]\n",
" Where label is True or False for positive and negative images respectively\n",
" \"\"\"\n",
" all_imgs = set(glob.glob(cal101_root + '/*/' + image_suffix))\n",
" pos_imgs = set(glob.glob(os.path.join(cal101_root, positive_folder) + '/' + image_suffix))\n",
" \n",
" neg_imgs = all_imgs - pos_imgs\n",
"\n",
" neg_sample_size = len(pos_imgs)\n",
" selected_negs = np.random.choice(list(neg_imgs), size=neg_sample_size, replace=False)\n",
"\n",
" print ('{} positive, {} negative images selected (out of {} negatives total)'.format(\n",
" len(pos_imgs), len(selected_negs), len(neg_imgs)))\n",
"\n",
" labeled_img_paths = [[path, True] for path in pos_imgs] + [[path, False] for path in selected_negs]\n",
"\n",
" return np.array(labeled_img_paths)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 positive, 10 negative images selected (out of 273 negatives total)\n"
]
}
],
"source": [
"positive_folder = 'wrong'\n",
"imgs = binary_labeled_img_from_folder(positive_folder)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"generating SIFT descriptors for 20 images\n",
"SIFT descriptors generated.\n",
"0.8 2\n"
]
},
{
"ename": "TypeError",
"evalue": "ufunc 'multiply' did not contain a loop with signature matching types dtype('<U47') dtype('<U47') dtype('<U47')",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-3bff5b49ad97>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkmeans\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgen_bow_features\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimgs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_train_ratio\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.8\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mK_clusters\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m750\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/notebook/develop/visual_bow.py\u001b[0m in \u001b[0;36mgen_bow_features\u001b[0;34m(imgs, test_train_ratio, K_clusters)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0mimg_descs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgen_sift_features\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimgs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0mpercent_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0.99\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mtest_train_ratio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0mtraining_idxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_idxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_idxs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_test_val_split_idxs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimgs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_train_ratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpercent_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcluster_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcluster_features\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimg_descs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_idxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mperform_data_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_idxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_idxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_idxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/notebook/develop/visual_bow.py\u001b[0m in \u001b[0;36mtrain_test_val_split_idxs\u001b[0;34m(total_rows, percent_test, percent_val)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0mrow_range\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{} {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpercent_test\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 67\u001b[0;31m \u001b[0mno_test_rows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtotal_rows\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpercent_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 68\u001b[0m \u001b[0mno_test_rows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mno_test_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mtest_idxs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchoice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow_range\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mno_test_rows\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: ufunc 'multiply' did not contain a loop with signature matching types dtype('<U47') dtype('<U47') dtype('<U47')"
]
}
],
"source": [
"X_train, X_test, y_train, y_test, kmeans = bow.gen_bow_features(imgs, test_train_ratio=0.8, K_clusters=750)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"c_vals = [0.0001, 0.01, 0.1, 1, 5, 10, 100, 1000]\n",
"\n",
"param_grid = [\n",
" {'C': c_vals, 'kernel': ['linear']},\n",
" {'C': c_vals, 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},\n",
" ]\n",
"\n",
"svc = GridSearchCV(SVC(), param_grid, n_jobs=-1)\n",
"svc.fit(X_train, y_train)\n",
"svc.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"svc.best_estimator_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# We have our estimator, let's try on a new picture"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for img_path in ['kanye_glasses.jpeg', \n",
" 'kanye_glasses2.jpeg', \n",
" 'more_pandas/0001.jpeg', \n",
" '101_ObjectCategories/brontosaurus/image_0001.jpg',\n",
" '101_ObjectCategories/brontosaurus/image_0002.jpg',\n",
" '101_ObjectCategories/dalmatian/image_0001.jpg',\n",
" '101_ObjectCategories/dalmatian/image_0002.jpg'\n",
" ]:\n",
" print img_path, svc.predict(bow.img_to_vect(img_path, kmeans))"
]
}
],
"metadata": {
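A note on the TypeError captured in the notebook output above: gen_bow_features passes imgs[0] — a single [image_path, label] row of strings — into train_test_val_split_idxs, which then multiplies that string array by the test fraction, producing the ufunc 'multiply' error on dtype('<U47'). Below is a minimal sketch of a split helper that works on the row count instead; it is a hypothetical fix, not the code shipped in visual_bow.py or in this commit.

# Hypothetical sketch: split indices by sample count, avoiding the string-array multiply.
import numpy as np

def split_idxs_by_count(n_rows, percent_test, percent_val):
    """Return disjoint train/test/validation index arrays for n_rows samples."""
    all_idxs = np.arange(n_rows)
    n_test = int(n_rows * percent_test)
    n_val = int(n_rows * percent_val)
    test_idxs = np.random.choice(all_idxs, size=n_test, replace=False)
    remaining = np.setdiff1d(all_idxs, test_idxs)
    val_idxs = np.random.choice(remaining, size=n_val, replace=False)
    train_idxs = np.setdiff1d(remaining, val_idxs)
    return train_idxs, test_idxs, val_idxs

# The caller would then pass the number of labeled images, not a single row:
# training_idxs, test_idxs, val_idxs = split_idxs_by_count(len(imgs), 0.8, 0.19)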
129 changes: 1 addition & 128 deletions develop/2017-05-05-silvio-PDF-to-PNG-SIFT-descriptors.py
@@ -1,7 +1,7 @@

# coding: utf-8

# # This Python notebook only shows how to convert PDF files to PNG
# # This Python notebook shows how to convert PDF files to PNG and use SIFT
#
# ## To do so, 1) install the following requirements:
#
Expand Down Expand Up @@ -210,130 +210,3 @@
# ### About these similar images, check the issue: https://github.com/datasciencebr/serenata-de-amor/issues/32
# ##### ---- So, I will also try to combine the SIFT descriptors with other information.
#

# In[1]:

import glob
from __future__ import print_function
from wand.image import Image

def convert_pdf_png_and_save(file_name, new_file_name):
    """Convert a PDF file to PNG and save it to disk
    arguments:
    file_name -- path to the PDF file on disk
    new_file_name -- my_path/12312.png
    """
    # Arguments chosen to read the file at a good resolution
    with Image(filename=file_name, resolution=300) as img:
        img.compression_quality = 99
        print('width =', img.width)
        print('height =', img.height)
        print('pages = ', len(img.sequence))
        print('resolution = ', img.resolution)

        # Format chosen to convert the PDF to an image
        with img.convert('png') as converted:
            converted.save(filename=new_file_name)


pdf_directory = "../data/training set/not_wrong/"

# Get the PDF files downloaded to our folder, e.g., /data
pdfs = glob.glob(pdf_directory + '*.pdf')

# If there are PDF files, derive the PNG name from each document id and convert
for file in pdfs:
    full_name = file.split("/")
    file_name = full_name[len(full_name) - 1]
    file_name = file_name.split(".pdf")
    file_name = pdf_directory + file_name[0] + ".png"
    convert_pdf_png_and_save(file, file_name)




# In[1]:

import visual_bow as bow
import cv2
import numpy as np
import glob
import os
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV


# In[2]:

def binary_labeled_img_from_folder(positive_folder, cal101_root='../data/training set/ML/', image_suffix='*.png'):
    """
    Generate a balanced dataset of positive and negative images from a directory of images
    where each type of image is separated in its own folder.

    Returns:
    --------
    labeled_img_paths: list of lists
        Of the form [[image_path, label], ...]
        Where label is True or False for positive and negative images respectively
    """
    all_imgs = set(glob.glob(cal101_root + '/*/' + image_suffix))
    pos_imgs = set(glob.glob(os.path.join(cal101_root, positive_folder) + '/' + image_suffix))

    neg_imgs = all_imgs - pos_imgs

    neg_sample_size = len(pos_imgs)
    selected_negs = np.random.choice(list(neg_imgs), size=neg_sample_size, replace=False)

    print('{} positive, {} negative images selected (out of {} negatives total)'.format(
        len(pos_imgs), len(selected_negs), len(neg_imgs)))

    labeled_img_paths = [[path, True] for path in pos_imgs] + [[path, False] for path in selected_negs]

    return np.array(labeled_img_paths)


# In[3]:

positive_folder = 'wrong'
imgs = binary_labeled_img_from_folder(positive_folder)


# In[4]:

X_train, X_test, y_train, y_test, kmeans = bow.gen_bow_features(imgs, test_train_ratio=0.8, K_clusters=750)


# In[ ]:

c_vals = [0.0001, 0.01, 0.1, 1, 5, 10, 100, 1000]

param_grid = [
    {'C': c_vals, 'kernel': ['linear']},
    {'C': c_vals, 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
]

svc = GridSearchCV(SVC(), param_grid, n_jobs=-1)
svc.fit(X_train, y_train)
svc.score(X_test, y_test)


# In[ ]:

svc.best_estimator_


# # We have our estimator, let's try it on a new picture

# In[ ]:

for img_path in ['kanye_glasses.jpeg',
                 'kanye_glasses2.jpeg',
                 'more_pandas/0001.jpeg',
                 '101_ObjectCategories/brontosaurus/image_0001.jpg',
                 '101_ObjectCategories/brontosaurus/image_0002.jpg',
                 '101_ObjectCategories/dalmatian/image_0001.jpg',
                 '101_ObjectCategories/dalmatian/image_0002.jpg'
                 ]:
    print(img_path, svc.predict(bow.img_to_vect(img_path, kmeans)))
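
Both the notebook and this script import GridSearchCV from sklearn.grid_search, which the DeprecationWarning in the notebook output says will be removed in scikit-learn 0.20. The following is a minimal sketch of the same grid search against the sklearn.model_selection API — an assumption for newer scikit-learn versions, not part of this commit — reusing the X_train/X_test/y_train/y_test arrays built by bow.gen_bow_features above.

# Sketch: identical grid search using the non-deprecated scikit-learn module.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

c_vals = [0.0001, 0.01, 0.1, 1, 5, 10, 100, 1000]
param_grid = [
    {'C': c_vals, 'kernel': ['linear']},
    {'C': c_vals, 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
]

svc = GridSearchCV(SVC(), param_grid, n_jobs=-1)
# Assumes X_train, X_test, y_train, y_test come from bow.gen_bow_features(imgs, ...)
svc.fit(X_train, y_train)
print(svc.best_estimator_, svc.score(X_test, y_test))

# Scoring a converted receipt would reuse the same feature pipeline, e.g. (hypothetical path):
# svc.predict(bow.img_to_vect('../data/training set/ML/wrong/12345.png', kmeans))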
