# svm2 (4).py

# -*- coding: utf-8 -*-
"""SVM2.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1peeZdHhgK0UbG2QZFEv-3JtaDcRbK7ay
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

# Load the MNIST training and test splits from their CSV exports
# (training file first, then test file).
train, test = (
    pd.read_csv(csv_name) for csv_name in ('mnist_train.csv', 'mnist_test.csv')
)

# NOTE: the bare expressions below only display a value when they are the
# last statement of a notebook cell; in a plain script they print nothing.

# (rows, columns) of each frame
train.shape

test.shape

# First five rows of each frame
train.head()

test.head()

# Missing-value count per column, shown for the first ten columns only
train.isna().sum().head(10)

test.isna().sum().head(10)

# Summary statistics for every column
test.describe()

train.describe()

# Overview of each dataset, test frame first and training frame second:
# dimensions, then the dtype / memory report, then the first rows.
for frame in (test, train):
    # dimensions
    print("Dimensions: ", frame.shape, "\n")

    # data types -- DataFrame.info() writes its own report and returns None,
    # so wrapping it in print() also emits a trailing "None"
    print(frame.info())

    # head (displays only in a notebook; no output in a plain script)
    frame.head()

# Column names of both frames
for frame in (train, test):
    print(frame.columns)

# Sorted list of the distinct labels present in the training data
label_values = train['label'].unique()
order = list(np.sort(label_values))
print(order)

## Visualizing the number of samples per digit class in the training set

# plt.figure(figsize=...) is the call that sets the canvas size; plt.plot()
# with no data and a `figure=` keyword does not resize anything.
plt.figure(figsize=(16, 10))

# Pass the column by keyword: the first positional parameter of
# sns.countplot is `data` in current seaborn, so a bare Series would not be
# interpreted as the x variable.
g = sns.countplot(x="label", data=train, palette='icefire')
plt.title('Number of digit classes')

# Exact per-class counts (displays only in a notebook)
train.label.astype('category').value_counts()

# Render one training sample as an image.

# Row 3 of the training frame, skipping column 0
# (assumes column 0 is the label and the remaining 784 are pixels -- TODO confirm)
sample_row = train.iloc[3, 1:]
sample_row.shape

# Flat pixel vector -> 28x28 matrix so it can be drawn
one = sample_row.values.reshape(28, 28)

plt.imshow(one, cmap='gray')
plt.title("Digit 1")

# Shuffling training data
# random.shuffle() works in place and returns None, so assigning its result
# always produced None. Its swap-based algorithm is also unsafe on a 2-D
# NumPy array: rows are views, so a swap duplicates one row and loses the other.
# np.random.permutation() returns a new array with the rows shuffled along
# the first axis and leaves the source data untouched.
from random import shuffle  # kept so the name stays in scope for any later code
train_shuffled = np.random.permutation(train.values)

# Extracting features as X and labels as y
X_train = train.drop(labels=["label"], axis=1)
y_train = train["label"]

# Loading test data: every column of the test frame is used as a feature.
# NOTE(review): if mnist_test.csv also carries a 'label' column it ends up
# inside X_test here -- confirm against the file.
X_test = test.values

print(f'X_train = {X_train.shape}, y = {y_train.shape}, X_test = {X_test.shape}')

# Draw the first 30 training samples as small images.
# The subplot grid is declared as 7 rows x 10 columns; only the first
# 30 cells (three rows) are filled.
plt.figure(figsize=(14, 12))
for sample_idx in range(30):
    plt.subplot(7, 10, sample_idx + 1)
    # one row of the feature frame -> 28x28 pixel matrix
    pixel_matrix = X_train.iloc[sample_idx].to_numpy().reshape(28, 28)
    plt.imshow(pixel_matrix, interpolation="none", cmap="afmhot")
    # hide the axis ticks; they carry no information for an image
    plt.xticks([])
    plt.yticks([])
plt.tight_layout()

# Bar chart of how many training samples each label has.
sns.set(style="darkgrid")
counts = sns.countplot(x="label", data=train, palette="Set1")

# Mean value of every feature column (label excluded), rounded to two
# decimals (displays only in a notebook)
feature_means = train.drop('label', axis=1).mean()
feature_means.round(2)

## Target vector: the 'label' column of the training frame
y = train['label']

## Size of the full training frame (label column included)
print(train.shape)

## Feature matrix: everything except 'label', divided by 255.0
X = train.drop(columns='label') / 255.0

## The test frame gets the same division.
## NOTE(review): this divides every column of `test`, including a label
## column if the file has one -- confirm.
test = test / 255.0

print("X:", X.shape)
print("test_data:", test.shape)

# Standardize each feature column.
# NOTE(review): the division by 255 above is a uniform rescaling, so it should
# not change the standardized result. The scaling statistics are also computed
# on all rows before the split below, so held-out rows influence them.
from sklearn.preprocessing import scale
X_scaled = scale(X)

# Hold-out split: 20% of the rows for training, 30% for testing, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    train_size=0.2,
    random_state=10,
)

# Linear-kernel SVM: fit on the training split, then report accuracy and the
# confusion matrix for both the training and the held-out split.

# Import the metric helpers before first use: calling accuracy_score ahead of
# its import raised a NameError.
from sklearn import metrics
from sklearn.metrics import accuracy_score

# linear model
model_linear = SVC(kernel='linear')
model_linear.fit(X_train, y_train)

# predict
x_pred = model_linear.predict(X_train)   # predictions on the training split
y_pred = model_linear.predict(X_test)    # predictions on the held-out split

# Reuse the training-split predictions instead of running predict() a second
# time on the same data.
predictions = x_pred
accuracy_score(y_train, predictions)  # displays only in a notebook

# training-split accuracy and confusion matrix
print(accuracy_score(y_train, x_pred, normalize=True, sample_weight=None))
print(metrics.confusion_matrix(y_true=y_train, y_pred=x_pred))

# confusion matrix and accuracy on the held-out split

# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

# cm
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

# run gc.collect() (garbage collect) to free up memory
# else, since the dataset is large and SVM is computationally heavy,
# it'll throw a memory error while training
import gc
gc.collect()

"""# New Section

# New Section
"""