Commit

Classifier to detect generalization in the description of reimbursements
silviodc committed May 27, 2017
1 parent 58d9faa commit 29bdae0
Showing 8 changed files with 14,840 additions and 867 deletions.
280 changes: 1 addition & 279 deletions develop/2017-05-05-silvio-PDF-to-PNG-SIFT-descriptors.html

Large diffs are not rendered by default.

231 changes: 1 addition & 230 deletions develop/2017-05-05-silvio-PDF-to-PNG-SIFT-descriptors.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# This python notebook only shows how to converte PDF files to PNG\n",
"# This python notebook shows how to converte PDF files to PNG and use SIFT\n",
"\n",
"## To do so, 1) install the following requiriments:\n",
"\n",
@@ -405,235 +405,6 @@
"### About these similar images check the issue: https://github.com/datasciencebr/serenata-de-amor/issues/32\n",
"##### ---- So, i also will try to combine the sift descriptors and other information. \n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import glob\n",
"from __future__ import print_function\n",
"from wand.image import Image\n",
"\n",
"def convert_pdf_png_and_save(file_name,new_file_name):\n",
" \"\"\"Convert a pdf file to png and save it at disk\n",
"\n",
" arguments:\n",
" file_name -- the real path to access the pdf file on disk\n",
" new_file_name -- my_path/12312.png\n",
" \"\"\"\n",
" #Default arguments to read the file and has a good resolution\n",
" with Image(filename=file_name, resolution=300) as img:\n",
" img.compression_quality = 99\n",
" print('width =', img.width)\n",
" print('height =', img.height)\n",
" print('pages = ', len(img.sequence))\n",
" print('resolution = ', img.resolution)\n",
"\n",
" #Format choosed to convert the pdf to image\n",
" with img.convert('png') as converted:\n",
" converted.save(filename=new_file_name)\n",
" \n",
"\n",
"pdf_directory=\"../data/training set/not_wrong/\"\n",
" \n",
"#Get the pdfs files downloaded in our folder, e.g., /data\n",
"pdfs = glob.glob(pdf_directory+'*.pdf')\n",
"\n",
"#Case we have pdf files we convert the pdf_files_name to doc_ids\n",
"for file in pdfs:\n",
" full_name= file.split(\"/\")\n",
" file_name = full_name[len(full_name)-1]\n",
" file_name= file_name.split(\".pdf\")\n",
" file_name= pdf_directory+file_name[0]+\".png\"\n",
" convert_pdf_png_and_save(file,file_name)\n",
"\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"OpenCV VERSION (should be 3.1.0 or later, with nonfree modules installed!): 3.2.0\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
" \"This module will be removed in 0.20.\", DeprecationWarning)\n",
"/opt/conda/lib/python3.5/site-packages/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.\n",
" DeprecationWarning)\n"
]
}
],
"source": [
"import visual_bow as bow\n",
"import cv2\n",
"import numpy as np\n",
"import glob\n",
"import os\n",
"from sklearn.svm import SVC\n",
"from sklearn.grid_search import GridSearchCV"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def binary_labeled_img_from_folder(positive_folder, cal101_root='../data/training set/ML/', image_suffix='*.png'):\n",
" \"\"\"\n",
" Generate a balanced dataset of positive and negative images from a directory of images\n",
" where each type of image is separated in its own folder.\n",
"\n",
" Returns:\n",
" --------\n",
" labeled_img_paths: list of lists\n",
" Of the form [[image_path, label], ...]\n",
" Where label is True or False for positive and negative images respectively\n",
" \"\"\"\n",
" all_imgs = set(glob.glob(cal101_root + '/*/' + image_suffix))\n",
" pos_imgs = set(glob.glob(os.path.join(cal101_root, positive_folder) + '/' + image_suffix))\n",
" \n",
" neg_imgs = all_imgs - pos_imgs\n",
"\n",
" neg_sample_size = len(pos_imgs)\n",
" selected_negs = np.random.choice(list(neg_imgs), size=neg_sample_size, replace=False)\n",
"\n",
" print ('{} positive, {} negative images selected (out of {} negatives total)'.format(\n",
" len(pos_imgs), len(selected_negs), len(neg_imgs)))\n",
"\n",
" labeled_img_paths = [[path, True] for path in pos_imgs] + [[path, False] for path in selected_negs]\n",
"\n",
" return np.array(labeled_img_paths)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"10 positive, 10 negative images selected (out of 273 negatives total)\n"
]
}
],
"source": [
"positive_folder = 'wrong'\n",
"imgs = binary_labeled_img_from_folder(positive_folder)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"generating SIFT descriptors for 20 images\n",
"SIFT descriptors generated.\n",
"0.8 2\n"
]
},
{
"ename": "TypeError",
"evalue": "ufunc 'multiply' did not contain a loop with signature matching types dtype('<U47') dtype('<U47') dtype('<U47')",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-3bff5b49ad97>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkmeans\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgen_bow_features\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimgs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_train_ratio\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.8\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mK_clusters\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m750\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/notebook/develop/visual_bow.py\u001b[0m in \u001b[0;36mgen_bow_features\u001b[0;34m(imgs, test_train_ratio, K_clusters)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0mimg_descs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgen_sift_features\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimgs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0mpercent_val\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0.99\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mtest_train_ratio\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0mtraining_idxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_idxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_idxs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_test_val_split_idxs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimgs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_train_ratio\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpercent_val\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcluster_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcluster_features\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimg_descs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_idxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mperform_data_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraining_idxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_idxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval_idxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/notebook/develop/visual_bow.py\u001b[0m in \u001b[0;36mtrain_test_val_split_idxs\u001b[0;34m(total_rows, percent_test, percent_val)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0mrow_range\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"{} {}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpercent_test\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtotal_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 67\u001b[0;31m \u001b[0mno_test_rows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtotal_rows\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpercent_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 68\u001b[0m \u001b[0mno_test_rows\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mno_test_rows\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mtest_idxs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrandom\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchoice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow_range\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mno_test_rows\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mTypeError\u001b[0m: ufunc 'multiply' did not contain a loop with signature matching types dtype('<U47') dtype('<U47') dtype('<U47')"
]
}
],
"source": [
"X_train, X_test, y_train, y_test, kmeans = bow.gen_bow_features(imgs, test_train_ratio=0.8, K_clusters=750)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"c_vals = [0.0001, 0.01, 0.1, 1, 5, 10, 100, 1000]\n",
"\n",
"param_grid = [\n",
" {'C': c_vals, 'kernel': ['linear']},\n",
" {'C': c_vals, 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},\n",
" ]\n",
"\n",
"svc = GridSearchCV(SVC(), param_grid, n_jobs=-1)\n",
"svc.fit(X_train, y_train)\n",
"svc.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"svc.best_estimator_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# We have our estimator, let's try on a new picture"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for img_path in ['kanye_glasses.jpeg', \n",
" 'kanye_glasses2.jpeg', \n",
" 'more_pandas/0001.jpeg', \n",
" '101_ObjectCategories/brontosaurus/image_0001.jpg',\n",
" '101_ObjectCategories/brontosaurus/image_0002.jpg',\n",
" '101_ObjectCategories/dalmatian/image_0001.jpg',\n",
" '101_ObjectCategories/dalmatian/image_0002.jpg'\n",
" ]:\n",
" print img_path, svc.predict(bow.img_to_vect(img_path, kmeans))"
]
}
],
"metadata": {
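A note on the TypeError captured in the notebook output above: gen_bow_features passes imgs[0] — a single [image_path, label] row of strings — into train_test_val_split_idxs, which then multiplies that string array by the test fraction, producing the ufunc 'multiply' error on dtype('<U47'). Below is a minimal sketch of a split helper that works on the row count instead; it is a hypothetical fix, not the code shipped in visual_bow.py or in this commit.

# Hypothetical sketch: split indices by sample count, avoiding the string-array multiply.
import numpy as np

def split_idxs_by_count(n_rows, percent_test, percent_val):
    """Return disjoint train/test/validation index arrays for n_rows samples."""
    all_idxs = np.arange(n_rows)
    n_test = int(n_rows * percent_test)
    n_val = int(n_rows * percent_val)
    test_idxs = np.random.choice(all_idxs, size=n_test, replace=False)
    remaining = np.setdiff1d(all_idxs, test_idxs)
    val_idxs = np.random.choice(remaining, size=n_val, replace=False)
    train_idxs = np.setdiff1d(remaining, val_idxs)
    return train_idxs, test_idxs, val_idxs

# The caller would then pass the number of labeled images, not a single row:
# training_idxs, test_idxs, val_idxs = split_idxs_by_count(len(imgs), 0.8, 0.19)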
129 changes: 1 addition & 128 deletions develop/2017-05-05-silvio-PDF-to-PNG-SIFT-descriptors.py
@@ -1,7 +1,7 @@

# coding: utf-8

# # This Python notebook only shows how to convert PDF files to PNG
# # This Python notebook shows how to convert PDF files to PNG and use SIFT
#
# ## To do so, 1) install the following requirements:
#
Expand Down Expand Up @@ -210,130 +210,3 @@
# ### About these similar images, check the issue: https://github.com/datasciencebr/serenata-de-amor/issues/32
# ##### ---- So, I will also try to combine the SIFT descriptors with other information.
#

# In[1]:

import glob
from __future__ import print_function
from wand.image import Image

def convert_pdf_png_and_save(file_name, new_file_name):
    """Convert a PDF file to PNG and save it to disk
    arguments:
    file_name -- path to the PDF file on disk
    new_file_name -- my_path/12312.png
    """
    # Arguments chosen to read the file at a good resolution
    with Image(filename=file_name, resolution=300) as img:
        img.compression_quality = 99
        print('width =', img.width)
        print('height =', img.height)
        print('pages = ', len(img.sequence))
        print('resolution = ', img.resolution)

        # Format chosen to convert the PDF to an image
        with img.convert('png') as converted:
            converted.save(filename=new_file_name)


pdf_directory = "../data/training set/not_wrong/"

# Get the PDF files downloaded to our folder, e.g., /data
pdfs = glob.glob(pdf_directory + '*.pdf')

# If there are PDF files, derive the PNG name from each document id and convert
for file in pdfs:
    full_name = file.split("/")
    file_name = full_name[len(full_name) - 1]
    file_name = file_name.split(".pdf")
    file_name = pdf_directory + file_name[0] + ".png"
    convert_pdf_png_and_save(file, file_name)




# In[1]:

import visual_bow as bow
import cv2
import numpy as np
import glob
import os
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV


# In[2]:

def binary_labeled_img_from_folder(positive_folder, cal101_root='../data/training set/ML/', image_suffix='*.png'):
    """
    Generate a balanced dataset of positive and negative images from a directory of images
    where each type of image is separated in its own folder.

    Returns:
    --------
    labeled_img_paths: list of lists
        Of the form [[image_path, label], ...]
        Where label is True or False for positive and negative images respectively
    """
    all_imgs = set(glob.glob(cal101_root + '/*/' + image_suffix))
    pos_imgs = set(glob.glob(os.path.join(cal101_root, positive_folder) + '/' + image_suffix))

    neg_imgs = all_imgs - pos_imgs

    neg_sample_size = len(pos_imgs)
    selected_negs = np.random.choice(list(neg_imgs), size=neg_sample_size, replace=False)

    print('{} positive, {} negative images selected (out of {} negatives total)'.format(
        len(pos_imgs), len(selected_negs), len(neg_imgs)))

    labeled_img_paths = [[path, True] for path in pos_imgs] + [[path, False] for path in selected_negs]

    return np.array(labeled_img_paths)


# In[3]:

positive_folder = 'wrong'
imgs = binary_labeled_img_from_folder(positive_folder)


# In[4]:

X_train, X_test, y_train, y_test, kmeans = bow.gen_bow_features(imgs, test_train_ratio=0.8, K_clusters=750)


# In[ ]:

c_vals = [0.0001, 0.01, 0.1, 1, 5, 10, 100, 1000]

param_grid = [
    {'C': c_vals, 'kernel': ['linear']},
    {'C': c_vals, 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
]

svc = GridSearchCV(SVC(), param_grid, n_jobs=-1)
svc.fit(X_train, y_train)
svc.score(X_test, y_test)


# In[ ]:

svc.best_estimator_


# # We have our estimator, let's try it on a new picture

# In[ ]:

for img_path in ['kanye_glasses.jpeg',
                 'kanye_glasses2.jpeg',
                 'more_pandas/0001.jpeg',
                 '101_ObjectCategories/brontosaurus/image_0001.jpg',
                 '101_ObjectCategories/brontosaurus/image_0002.jpg',
                 '101_ObjectCategories/dalmatian/image_0001.jpg',
                 '101_ObjectCategories/dalmatian/image_0002.jpg'
                 ]:
    print(img_path, svc.predict(bow.img_to_vect(img_path, kmeans)))
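
Both the notebook and this script import GridSearchCV from sklearn.grid_search, which the DeprecationWarning in the notebook output says will be removed in scikit-learn 0.20. The following is a minimal sketch of the same grid search against the sklearn.model_selection API — an assumption for newer scikit-learn versions, not part of this commit — reusing the X_train/X_test/y_train/y_test arrays built by bow.gen_bow_features above.

# Sketch: identical grid search using the non-deprecated scikit-learn module.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

c_vals = [0.0001, 0.01, 0.1, 1, 5, 10, 100, 1000]
param_grid = [
    {'C': c_vals, 'kernel': ['linear']},
    {'C': c_vals, 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
]

svc = GridSearchCV(SVC(), param_grid, n_jobs=-1)
# Assumes X_train, X_test, y_train, y_test come from bow.gen_bow_features(imgs, ...)
svc.fit(X_train, y_train)
print(svc.best_estimator_, svc.score(X_test, y_test))

# Scoring a converted receipt would reuse the same feature pipeline, e.g. (hypothetical path):
# svc.predict(bow.img_to_vect('../data/training set/ML/wrong/12345.png', kmeans))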
