**Exercise 12**

Implement batch gradient descent from scratch (no SKLearn!)

In [1]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from sklearn import datasets

%matplotlib inline

In [2]:
iris = datasets.load_iris()
list(iris.keys())

['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']

In [3]:
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [4]:
X = iris["data"][:, (2,3)]  # petal length and width
y = (iris["target"])  # 1 if Iris virginica, else 0

In [5]:
# Important variables

X_with_bias = np.c_[np.ones([len(X), 1]), X] # Add column of ones for theta intercept term
alpha = 0.1
iterations=1500

print(X.shape)

# NOTE: If ValueError: all input arrays must have the same shape appears then you may have run this cel multiple times
#    which will have added multiple collumns of ones to the matrix X

(150, 2)


In [70]:
# Setup our proportions

test_ratio = .2
val_ratio = .2
total_size = len(X)

# Calculate size of our splits

test_size = int(test_ratio*total_size)
val_size = int(val_ratio*total_size)
train_size = total_size - test_size - val_size

# Split our data

rnd_indices = np.random.permutation(total_size) # Shuffle our input matrix

X_train = X_with_bias[rnd_indices[:train_size]]
y_train = y[rnd_indices[:train_size]]
X_valid = X_with_bias[rnd_indices[train_size:-test_size]]
y_valid = y[rnd_indices[train_size:-test_size]]
X_test = X_with_bias[rnd_indices[-test_size:]]
y_test = y[rnd_indices[-test_size:]]

In [71]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(90, 3)
(30, 2)
(30, 3)


In [72]:
def to_one_hot(y):
    n_classes = y.max() + 1
    m = len(y)
    Y_one_hot = np.zeros((m, n_classes)) # Setup zero matrix with m rows and a column for each class
    Y_one_hot[np.arange(m), y] = 1 # Fill in ones
    return Y_one_hot

In [73]:
y_train[:10]

array([2, 2, 2, 0, 0, 0, 1, 2, 0, 2])

In [74]:
to_one_hot(y_train[:10])

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [75]:
Y_train_one_hot = to_one_hot(y_train)
Y_test_one_hot = to_one_hot(y_test)
Y_val_one_hot = to_one_hot(y_val)

In [76]:
# Softmax function = exp(X) / (sum of exp(X))

def softmax(logits):
    exps = np.exp(logits)
    exp_sums = np.sum(exps, axis=1, keepdims=True)
    return exps / exp_sums

In [82]:
n_inputs = X_train.shape[1] # Number of features
n_outputs = len(np.unique(y_train)) # 3 uniqure values which will each be a possible output

In [80]:
eta = 0.01
n_iterations = 5001
m = len(X_train)
epsilon = 1e-7

Theta = np.random.randn(n_inputs, n_outputs)

# Cycle through set to apply batch gradient descent

for iteration in range(n_iterations):
    logits = X_train.dot(Theta) # Logits which are raw predictions from applying X to Theta
    p_hat = softmax(logits) # Apply softmax to logits to get our probabilities
    loss = -np.mean(np.sum(Y_train_one_hot * np.log(p_hat + epsilon), axis=1)) # Compute loss function
    error = p_hat - Y_train_one_hot # Compute error 
    if iteration % 500 == 0:
        print(iteration, loss)
    Grad = 1/m * X_train.T.dot(error)
    Theta = Theta - eta * Grad
    

0 1.4567897105648775
500 0.7451993577978241
1000 0.6279369677273878
1500 0.5572702696067121
2000 0.5111859948576022
2500 0.47856473219026296
3000 0.45387932862540925
3500 0.43422780377165426
4000 0.41797875623202274
4500 0.4041537521442775
5000 0.39213163561158126


In [81]:
Theta

array([[ 3.61613128,  0.06856255, -2.86225561],
       [-0.2597962 ,  0.80558911,  0.70553675],
       [-0.90831271,  0.18903751,  2.43558706]])

In [87]:
# Predictions

logits = X_valid.dot(Theta)
p_hat = softmax(logits)
y_pred = np.argmax(p_hat, axis=1)

accuracy_score = np.mean(y_pred == y_valid)
accuracy_score

0.9666666666666667