{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from matplotlib import pyplot as plt\n",
    "import os\n",
    "from sklearn.datasets import fetch_openml\n",
    "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Exercise 8**\n",
    "\n",
    "Create hard/soft voting ensemble on mnist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "mnist = fetch_openml('mnist_784', version=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mnist.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(70000, 784)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X, y = mnist['data'], mnist['target']\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split into train, val, test sets of size 50k, 10k, 10k\n",
    "\n",
    "X_train = X[:50000]\n",
    "y_train = y[:50000]\n",
    "X_val = X[50000:60000]\n",
    "y_val = y[50000:60000]\n",
    "X_test = X[60000:]\n",
    "y_test = y[60000:]\n",
    "\n",
    "# Normalize features\n",
    "\n",
    "X_train /= 255.0\n",
    "X_val /= 255.0\n",
    "X_test /= 255.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.0\n"
     ]
    }
   ],
   "source": [
    "print(X_test.max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training our  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
      "                       criterion='gini', max_depth=None, max_features='auto',\n",
      "                       max_leaf_nodes=None, max_samples=None,\n",
      "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
      "                       min_samples_leaf=1, min_samples_split=2,\n",
      "                       min_weight_fraction_leaf=0.0, n_estimators=100,\n",
      "                       n_jobs=None, oob_score=False, random_state=None,\n",
      "                       verbose=0, warm_start=False)\n",
      "Training our  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,\n",
      "                     criterion='gini', max_depth=None, max_features='auto',\n",
      "                     max_leaf_nodes=None, max_samples=None,\n",
      "                     min_impurity_decrease=0.0, min_impurity_split=None,\n",
      "                     min_samples_leaf=1, min_samples_split=2,\n",
      "                     min_weight_fraction_leaf=0.0, n_estimators=100,\n",
      "                     n_jobs=None, oob_score=False, random_state=None, verbose=0,\n",
      "                     warm_start=False)\n",
      "Training our  LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
      "          intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
      "          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
      "          verbose=0)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
      "  \"the number of iterations.\", ConvergenceWarning)\n"
     ]
    }
   ],
   "source": [
    "rfc = RandomForestClassifier()\n",
    "etc = ExtraTreesClassifier()\n",
    "svc = LinearSVC()\n",
    "\n",
    "classifiers = [rfc, etc, svc]\n",
    "scores = []\n",
    "\n",
    "# Fit each classifier to the training set and predict on X_val\n",
    "for clf in classifiers:\n",
    "    print('Training our ', clf)\n",
    "    clf.fit(X_train, y_train)\n",
    "    score = clf.score(X_val, y_val)\n",
    "    scores.append(score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.9719, 0.9741, 0.9208]\n"
     ]
    }
   ],
   "source": [
    "print(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
      "  \"the number of iterations.\", ConvergenceWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "VotingClassifier(estimators=[('rf',\n",
       "                              RandomForestClassifier(bootstrap=True,\n",
       "                                                     ccp_alpha=0.0,\n",
       "                                                     class_weight=None,\n",
       "                                                     criterion='gini',\n",
       "                                                     max_depth=None,\n",
       "                                                     max_features='auto',\n",
       "                                                     max_leaf_nodes=None,\n",
       "                                                     max_samples=None,\n",
       "                                                     min_impurity_decrease=0.0,\n",
       "                                                     min_impurity_split=None,\n",
       "                                                     min_samples_leaf=1,\n",
       "                                                     min_samples_split=2,\n",
       "                                                     min_weight_fraction_leaf=0.0,\n",
       "                                                     n_estimators=100,\n",
       "                                                     n_jobs=None,\n",
       "                                                     oob_score...\n",
       "                                                   n_estimators=100,\n",
       "                                                   n_jobs=None, oob_score=False,\n",
       "                                                   random_state=None, verbose=0,\n",
       "                                                   warm_start=False)),\n",
       "                             ('sv',\n",
       "                              LinearSVC(C=1.0, class_weight=None, dual=True,\n",
       "                                        fit_intercept=True, intercept_scaling=1,\n",
       "                                        loss='squared_hinge', max_iter=1000,\n",
       "                                        multi_class='ovr', penalty='l2',\n",
       "                                        random_state=None, tol=0.0001,\n",
       "                                        verbose=0))],\n",
       "                 flatten_transform=True, n_jobs=None, voting='hard',\n",
       "                 weights=None)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import VotingClassifier\n",
    "\n",
    "# Hard vote ensmeble\n",
    "voting_clf = VotingClassifier(\n",
    "    estimators=[('rf', rfc), ('et', etc), ('sv', svc)],\n",
    "    voting='hard'\n",
    ")\n",
    "\n",
    "voting_clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9719"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "voting_clf.score(X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Try without SVC\n",
    "\n",
    "del voting_clf.estimators_[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9732"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "voting_clf.score(X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9752"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Set to soft voting and check if better\n",
    "\n",
    "voting_clf.voting='soft'\n",
    "\n",
    "voting_clf.score(X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9707"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check on Test Set\n",
    "\n",
    "voting_clf.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Exercise 9**\n",
    "\n",
    "train a stacking ensemble on our previous classifiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Round up our predictions\n",
    "\n",
    "X_val_predictions = np.empty((len(X_val), len(classifiers)), dtype=np.float32)\n",
    "\n",
    "for index, clf in enumerate(classifiers):\n",
    "    X_val_predictions[:, index] = clf.predict(X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[3. 3. 3.]\n",
      " [8. 8. 8.]\n",
      " [6. 6. 6.]\n",
      " ...\n",
      " [5. 5. 5.]\n",
      " [6. 6. 6.]\n",
      " [8. 8. 8.]]\n"
     ]
    }
   ],
   "source": [
    "print(X_val_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
       "                       criterion='gini', max_depth=None, max_features='auto',\n",
       "                       max_leaf_nodes=None, max_samples=None,\n",
       "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                       min_samples_leaf=1, min_samples_split=2,\n",
       "                       min_weight_fraction_leaf=0.0, n_estimators=200,\n",
       "                       n_jobs=None, oob_score=True, random_state=None,\n",
       "                       verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train a classifier which will take as input our predictions matrix\n",
    "blender = RandomForestClassifier(n_estimators=200, oob_score=True)\n",
    "blender.fit(X_val_predictions, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9727"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check our out of bag score to get an idea of accuracy\n",
    "blender.oob_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Round up predictions for X_test\n",
    "X_test_predictions = np.empty((len(X_val), len(classifiers)), dtype=np.float32)\n",
    "\n",
    "for index, clf in enumerate(classifiers):\n",
    "    X_test_predictions[:, index] = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use our blender to predict based on our predictions matrix\n",
    "y_pred = blender.predict(X_test_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.968"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accuracy_score(y_pred, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}