FreeUKGen · iamgroot42 · Feb 28, 2018 · Mar 2, 2018
diff --git a/classification/.gitignore b/classification/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/classification/README.md b/classification/README.md
@@ -0,0 +1,12 @@
+## Classification of images into entry/other
+
+### Proposed Technique
+* Convert all images to grayscale (a single 8-bit channel).
+* Resize all images to 150 x 250 pixels (height x width).
+* Define a simple CNN-classifier and train it on the given data
+* Batch-normalization is used to handle the variance in given data, while automatic class-weights are used to balance the error function (as the class distribution is biased)
+* Because only a small amount of labelled data is available, a small learning rate is used to reduce the risk of overfitting.
+
+### Running it
+* Run `python trainClassifier.py <images_folder> <label_file>` from the current directory to train an end-to-end model.
+* For example, run `python trainClassifier.py images/freecen/ data/gold/combined_classifications_20180227.csv`
diff --git a/classification/model.py b/classification/model.py
@@ -0,0 +1,30 @@
+import keras
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Flatten, Activation
+from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
+
+# Define a simple CNN model
+
+
+def getSimpleCNN(input_shape, num_classes):
+    """Return a compiled Sequential CNN ending in a num_classes-way softmax.
+
+    input_shape is the per-sample shape handed to the first Conv2D layer. The
+    model is compiled with categorical cross-entropy, Adadelta(lr=0.1) and an
+    accuracy metric.
+    """
+    layers = [
+        Conv2D(16, kernel_size=(3, 3), input_shape=input_shape),
+        BatchNormalization(), Activation('relu'),
+        Conv2D(32, (3, 3)),
+        BatchNormalization(), Activation('relu'),
+        MaxPooling2D(pool_size=(2, 2)), Dropout(0.25),
+        Flatten(),
+        Dense(64), BatchNormalization(), Activation('relu'),
+        Dropout(0.5),
+        Dense(num_classes, activation='softmax'),
+    ]
+    cnn = Sequential(layers)
+    cnn.compile(loss=keras.losses.categorical_crossentropy,
+                optimizer=keras.optimizers.Adadelta(lr=0.1), metrics=['accuracy'])
+    return cnn
diff --git a/classification/readData.py b/classification/readData.py
@@ -0,0 +1,37 @@
+import numpy as np
+from PIL import Image
+import os
+from tqdm import tqdm
+from scipy.misc import imresize
+import csv
+
+# Read label classification file, construct data
+
+
+def getData(imageDirPrefix, filePath):
+    """Load the labelled images listed in a CSV file; return (X, Y, mappingDict).
+
+    Each CSV row holds an image path (relative to imageDirPrefix) in column 0
+    and its class name in column 1. X stacks the resized grayscale images with
+    a trailing axis of size 1, Y holds integer class indices, and mappingDict
+    maps each class name to its index in sorted order.
+    """
+    images, labels = [], []
+    with open(filePath, 'r') as labelFile:
+        for row in tqdm(csv.reader(labelFile)):
+            relPath, imgClass = row[0], row[1]
+            # 'L' is PIL's 8-bit grayscale mode
+            grey = np.asarray(Image.open(os.path.join(imageDirPrefix, relPath)).convert('L'))
+            # Shrink every image to the same fixed (150, 250) size
+            images.append(imresize(grey, (150, 250)))
+            labels.append(imgClass)
+    X = np.array(images)[..., np.newaxis]
+    mappingDict = {name: idx for idx, name in enumerate(sorted(set(labels)))}
+    Y = np.array([mappingDict[name] for name in labels])
+    return X, Y, mappingDict
+
+
+if __name__ == "__main__":
+    import sys  # local import: only needed when this module is run as a script
+    X, Y, mapping = getData(sys.argv[1], sys.argv[2])
+    print("%s %s" % (X.shape, Y.shape))  # single-argument form: same output on Python 2 and 3
diff --git a/classification/trainClassifier.py b/classification/trainClassifier.py
@@ -0,0 +1,19 @@
+import readData
+import model
+import keras
+
+
+if __name__ == "__main__":
+    import sys
+    from collections import Counter
+    # Load images and integer labels; mapping is class-name -> class-index
+    X, Y, mapping = readData.getData(sys.argv[1], sys.argv[2])
+    num_classes = len(mapping)
+    # Keras documents class_weight as a dict (class index -> weight); 'auto' is not a supported
+    # value, so build inverse-frequency weights n_samples / (n_classes * class_count) instead.
+    counts = Counter(Y.tolist())
+    class_weight = {c: float(len(Y)) / (num_classes * n) for c, n in counts.items()}
+    # Load a simple CNN; a distinct name avoids shadowing the imported `model` module
+    classifier = model.getSimpleCNN(X.shape[1:], num_classes)
+    classifier.fit(X, keras.utils.to_categorical(Y, num_classes), batch_size=8,
+                   epochs=20, validation_split=0.2, class_weight=class_weight)