In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize

**Exercise 8**

Create hard/soft voting ensemble on mnist

In [3]:
mnist = fetch_openml('mnist_784', version=1)

In [4]:
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [5]:
X, y = mnist['data'], mnist['target']
X.shape

(70000, 784)

In [6]:
# Split into train, val, test sets of size 50k, 10k, 10k

X_train = X[:50000]
y_train = y[:50000]
X_val = X[50000:60000]
y_val = y[50000:60000]
X_test = X[60000:]
y_test = y[60000:]

# Normalize features

X_train /= 255.0
X_val /= 255.0
X_test /= 255.0

In [7]:
print(X_test.max())

1.0


In [8]:
rfc = RandomForestClassifier()
etc = ExtraTreesClassifier()
svc = LinearSVC()

classifiers = [rfc, etc, svc]
scores = []

# Fit each classifier to the training set and predict on X_val
for clf in classifiers:
    print('Training our ', clf)
    clf.fit(X_train, y_train)
    score = clf.score(X_val, y_val)
    scores.append(score)

Training our  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Training our  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
             



In [9]:
print(scores)

[0.9719, 0.9741, 0.9208]


In [10]:
from sklearn.ensemble import VotingClassifier

# Hard vote ensmeble
voting_clf = VotingClassifier(
    estimators=[('rf', rfc), ('et', etc), ('sv', svc)],
    voting='hard'
)

voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
        

In [11]:
voting_clf.score(X_val, y_val)

0.9719

In [12]:
# Try without SVC

del voting_clf.estimators_[2]

In [13]:
voting_clf.score(X_val, y_val)

0.9732

In [14]:
# Set to soft voting and check if better

voting_clf.voting='soft'

voting_clf.score(X_val, y_val)

0.9752

In [15]:
# Check on Test Set

voting_clf.score(X_test, y_test)

0.9707

**Exercise 9**

train a stacking ensemble on our previous classifiers

In [30]:
# Round up our predictions

X_val_predictions = np.empty((len(X_val), len(classifiers)), dtype=np.float32)

for index, clf in enumerate(classifiers):
    X_val_predictions[:, index] = clf.predict(X_val)

In [31]:
print(X_val_predictions)

[[3. 3. 3.]
 [8. 8. 8.]
 [6. 6. 6.]
 ...
 [5. 5. 5.]
 [6. 6. 6.]
 [8. 8. 8.]]


In [32]:
# Train a classifier which will take as input our predictions matrix
blender = RandomForestClassifier(n_estimators=200, oob_score=True)
blender.fit(X_val_predictions, y_val)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [33]:
# Check our out of bag score to get an idea of accuracy
blender.oob_score_

0.9727

In [35]:
# Round up predictions for X_test
X_test_predictions = np.empty((len(X_val), len(classifiers)), dtype=np.float32)

for index, clf in enumerate(classifiers):
    X_test_predictions[:, index] = clf.predict(X_test)

In [36]:
# Use our blender to predict based on our predictions matrix
y_pred = blender.predict(X_test_predictions)

In [38]:
accuracy_score(y_pred, y_test)

0.968