{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from matplotlib import pyplot as plt\n",
    "import os\n",
    "from sklearn.datasets import fetch_openml\n",
    "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration shared by the cells below\n",
    "SEED = 42          # seeds the stochastic estimators so re-runs give the same scores\n",
    "TRAIN_END = 50000  # rows [0, TRAIN_END) form the training set\n",
    "VAL_END = 60000    # rows [TRAIN_END, VAL_END) form the validation set; the rest is the test set"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Exercise 8**\n",
    "\n",
    "Create a hard/soft voting ensemble on MNIST."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# as_frame=False keeps data/target as NumPy arrays regardless of the\n",
    "# scikit-learn version's default for this parameter\n",
    "mnist = fetch_openml('mnist_784', version=1, as_frame=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mnist.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(70000, 784)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X, y = mnist['data'], mnist['target']\n",
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split into train, val, test sets of size 50k, 10k, 10k and normalize features.\n",
    "#\n",
    "# The division builds new arrays instead of using `/=` on the slices: the slices\n",
    "# are views of X, so an in-place divide would modify X itself and re-running this\n",
    "# cell would scale the data a second time.\n",
    "\n",
    "X_train = X[:TRAIN_END] / 255.0\n",
    "y_train = y[:TRAIN_END]\n",
    "X_val = X[TRAIN_END:VAL_END] / 255.0\n",
    "y_val = y[TRAIN_END:VAL_END]\n",
    "X_test = X[VAL_END:] / 255.0\n",
    "y_test = y[VAL_END:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.0\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: the largest scaled value should be 1.0\n",
    "print(X_test.max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "rfc = RandomForestClassifier(random_state=SEED)\n",
    "etc = ExtraTreesClassifier(random_state=SEED)\n",
    "# dual=False: scikit-learn recommends the primal solver when n_samples > n_features\n",
    "# (50000 x 784 here). The default dual solver stopped at max_iter=1000 with a\n",
    "# ConvergenceWarning on this data.\n",
    "svc = LinearSVC(dual=False)\n",
    "\n",
    "classifiers = [rfc, etc, svc]\n",
    "scores = []\n",
    "\n",
    "# Fit each classifier to the training set and score it on X_val\n",
    "for clf in classifiers:\n",
    "    print('Training our ', clf)\n",
    "    clf.fit(X_train, y_train)\n",
    "    score = clf.score(X_val, y_val)\n",
    "    scores.append(score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.9719, 0.9741, 0.9208]\n"
     ]
    }
   ],
   "source": [
    "# Validation accuracy in the same order as `classifiers`: rfc, etc, svc\n",
    "print(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hard vote ensemble\n",
    "voting_clf = VotingClassifier(\n",
    "    estimators=[('rf', rfc), ('et', etc), ('sv', svc)],\n",
    "    voting='hard'\n",
    ")\n",
    "\n",
    "# Trailing `;` suppresses the long estimator repr\n",
    "voting_clf.fit(X_train, y_train);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9719"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "voting_clf.score(X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Try without SVC\n",
    "#\n",
    "# The fitted LinearSVC is removed by type rather than with\n",
    "# `del voting_clf.estimators_[2]`, so re-running this cell cannot delete a\n",
    "# second estimator by position.\n",
    "\n",
    "voting_clf.estimators_ = [\n",
    "    est for est in voting_clf.estimators_ if not isinstance(est, LinearSVC)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9732"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "voting_clf.score(X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9752"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Set to soft voting and check if better.\n",
    "# Soft voting averages predict_proba outputs, which LinearSVC does not provide,\n",
    "# so this only works because the SVC was removed from estimators_ above.\n",
    "\n",
    "voting_clf.set_params(voting='soft')\n",
    "\n",
    "voting_clf.score(X_val, y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9707"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check on Test Set\n",
    "\n",
    "voting_clf.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Exercise 9**\n",
    "\n",
    "Train a stacking ensemble on our previous classifiers."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def stack_predictions(fitted_classifiers, X):\n",
    "    \"\"\"Return one column of predicted labels per classifier.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    fitted_classifiers : sequence of already-fitted estimators exposing predict().\n",
    "    X : samples to predict on; only len(X) and predict(X) are used.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float32 array of shape (len(X), len(fitted_classifiers)); column j holds\n",
    "    fitted_classifiers[j].predict(X).\n",
    "    \"\"\"\n",
    "    # Sizing the matrix from X itself keeps it correct for any split size.\n",
    "    # Labels are cast to float32 on assignment, so they must be numeric or\n",
    "    # numeric strings.\n",
    "    stacked = np.empty((len(X), len(fitted_classifiers)), dtype=np.float32)\n",
    "    for column, fitted in enumerate(fitted_classifiers):\n",
    "        stacked[:, column] = fitted.predict(X)\n",
    "    return stacked"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Round up our predictions on the validation set (one column per classifier)\n",
    "\n",
    "X_val_predictions = stack_predictions(classifiers, X_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[3. 3. 3.]\n",
      " [8. 8. 8.]\n",
      " [6. 6. 6.]\n",
      " ...\n",
      " [5. 5. 5.]\n",
      " [6. 6. 6.]\n",
      " [8. 8. 8.]]\n"
     ]
    }
   ],
   "source": [
    "print(X_val_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train a classifier which will take as input our predictions matrix.\n",
    "# It is fitted on validation-set predictions, which the base classifiers did not train on.\n",
    "blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=SEED)\n",
    "blender.fit(X_val_predictions, y_val);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9727"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check our out of bag score to get an idea of accuracy\n",
    "blender.oob_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Round up predictions for X_test (the matrix is sized from X_test, not X_val)\n",
    "X_test_predictions = stack_predictions(classifiers, X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use our blender to predict based on our predictions matrix\n",
    "y_pred = blender.predict(X_test_predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.968"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# accuracy_score expects (y_true, y_pred)\n",
    "accuracy_score(y_test, y_pred)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}