diff --git a/Ch4/.ipynb_checkpoints/exercises-checkpoint.ipynb b/Ch4/.ipynb_checkpoints/exercises-checkpoint.ipynb new file mode 100644 index 000000000..606665cc9 --- /dev/null +++ b/Ch4/.ipynb_checkpoints/exercises-checkpoint.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise 12**\n", + "\n", + "Implement batch gradient descent from scratch (no SKLearn!)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import os\n", + "from matplotlib import pyplot as plt\n", + "from sklearn import datasets\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris = datasets.load_iris()\n", + "list(iris.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_iris_dataset:\n", + "\n", + "Iris plants dataset\n", + "--------------------\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 150 (50 in each of three classes)\n", + " :Number of Attributes: 4 numeric, predictive attributes and the class\n", + " :Attribute Information:\n", + " - sepal length in cm\n", + " - sepal width in cm\n", + " - petal length in cm\n", + " - petal width in cm\n", + " - class:\n", + " - Iris-Setosa\n", + " - Iris-Versicolour\n", + " - Iris-Virginica\n", + " \n", + " :Summary Statistics:\n", + "\n", + " ============== ==== ==== ======= ===== ====================\n", + " Min Max Mean SD Class Correlation\n", + " ============== ==== ==== ======= ===== ====================\n", + " sepal length: 4.3 7.9 5.84 0.83 0.7826\n", + " sepal width: 2.0 4.4 3.05 0.43 -0.4194\n", + " petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n", + " petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n", + " ============== ==== ==== ======= ===== ====================\n", + "\n", + " :Missing Attribute Values: None\n", + " :Class Distribution: 33.3% for each of 3 classes.\n", + " :Creator: R.A. Fisher\n", + " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", + " :Date: July, 1988\n", + "\n", + "The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\n", + "from Fisher's paper. Note that it's the same as in R, but not as in the UCI\n", + "Machine Learning Repository, which has two wrong data points.\n", + "\n", + "This is perhaps the best known database to be found in the\n", + "pattern recognition literature. Fisher's paper is a classic in the field and\n", + "is referenced frequently to this day. (See Duda & Hart, for example.) The\n", + "data set contains 3 classes of 50 instances each, where each class refers to a\n", + "type of iris plant. One class is linearly separable from the other 2; the\n", + "latter are NOT linearly separable from each other.\n", + "\n", + ".. 
topic:: References\n", + "\n", + " - Fisher, R.A. \"The use of multiple measurements in taxonomic problems\"\n", + " Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\n", + " Mathematical Statistics\" (John Wiley, NY, 1950).\n", + " - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n", + " (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n", + " - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\n", + " Structure and Classification Rule for Recognition in Partially Exposed\n", + " Environments\". IEEE Transactions on Pattern Analysis and Machine\n", + " Intelligence, Vol. PAMI-2, No. 1, 67-71.\n", + " - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\". IEEE Transactions\n", + " on Information Theory, May 1972, 431-433.\n", + " - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al\"s AUTOCLASS II\n", + " conceptual clustering system finds 3 classes in the data.\n", + " - Many, many more ...\n" + ] + } + ], + "source": [ + "print(iris.DESCR)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "X = iris[\"data\"][:, (2,3)] # petal length and width\n", + "y = (iris[\"target\"]) # integer class labels 0, 1, 2 (all three classes are kept)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(150, 2)\n" + ] + } + ], + "source": [ + "# Important variables\n", + "\n", + "X_with_bias = np.c_[np.ones([len(X), 1]), X] # Add column of ones for theta intercept term\n", + "alpha = 0.1\n", + "iterations=1500\n", + "\n", + "print(X.shape)\n", + "\n", + "# NOTE: If ValueError: all input arrays must have the same shape appears then you may have run this cell multiple times\n", + "# which will have added multiple columns of ones to the matrix X" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + 
"# Setup our proportions\n", + "\n", + "test_ratio = .2\n", + "val_ratio = .2\n", + "total_size = len(X)\n", + "\n", + "# Calculate size of our splits\n", + "\n", + "test_size = int(test_ratio*total_size)\n", + "val_size = int(val_ratio*total_size)\n", + "train_size = total_size - test_size - val_size\n", + "\n", + "# Split our data\n", + "\n", + "rnd_indices = np.random.permutation(total_size) # Random permutation of the row indices\n", + "\n", + "X_train = X_with_bias[rnd_indices[:train_size]]\n", + "y_train = y[rnd_indices[:train_size]]\n", + "X_valid = X_with_bias[rnd_indices[train_size:-test_size]]\n", + "y_valid = y[rnd_indices[train_size:-test_size]]\n", + "X_test = X_with_bias[rnd_indices[-test_size:]]\n", + "y_test = y[rnd_indices[-test_size:]]" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(90, 3)\n", + "(30, 3)\n", + "(30, 3)\n" + ] + } + ], + "source": [ + "print(X_train.shape)\n", + "print(X_valid.shape)\n", + "print(X_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "def to_one_hot(y, n_classes=None):\n", + "    n_classes = y.max() + 1 if n_classes is None else n_classes # Infer the class count from the labels unless the caller supplies it\n", + "    m = len(y)\n", + "    Y_one_hot = np.zeros((m, n_classes)) # Setup zero matrix with m rows and a column for each class\n", + "    Y_one_hot[np.arange(m), y] = 1 # Fill in ones\n", + "    return Y_one_hot" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([2, 2, 2, 0, 0, 0, 1, 2, 0, 2])" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 0., 1.],\n", + " [0., 0., 1.],\n", + " [0., 0., 1.],\n", + " [1., 0., 0.],\n", + " [1., 0., 0.],\n", + " [1., 
0., 0.],\n", + " [0., 1., 0.],\n", + " [0., 0., 1.],\n", + " [1., 0., 0.],\n", + " [0., 0., 1.]])" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to_one_hot(y_train[:10])" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "Y_train_one_hot = to_one_hot(y_train)\n", + "Y_test_one_hot = to_one_hot(y_test)\n", + "Y_val_one_hot = to_one_hot(y_valid)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "# Softmax function = exp(X) / (sum of exp(X))\n", + "\n", + "def softmax(logits):\n", + "    exps = np.exp(logits - np.max(logits, axis=1, keepdims=True)) # Shift by the row max so np.exp cannot overflow; the ratio is unchanged\n", + "    exp_sums = np.sum(exps, axis=1, keepdims=True)\n", + "    return exps / exp_sums" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "n_inputs = X_train.shape[1] # Number of input columns (features plus the bias column)\n", + "n_outputs = len(np.unique(y_train)) # 3 unique values which will each be a possible output" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 1.4567897105648775\n", + "500 0.7451993577978241\n", + "1000 0.6279369677273878\n", + "1500 0.5572702696067121\n", + "2000 0.5111859948576022\n", + "2500 0.47856473219026296\n", + "3000 0.45387932862540925\n", + "3500 0.43422780377165426\n", + "4000 0.41797875623202274\n", + "4500 0.4041537521442775\n", + "5000 0.39213163561158126\n" + ] + } + ], + "source": [ + "eta = 0.01\n", + "n_iterations = 5001\n", + "m = len(X_train)\n", + "epsilon = 1e-7\n", + "\n", + "Theta = np.random.randn(n_inputs, n_outputs)\n", + "\n", + "# Cycle through set to apply batch gradient descent\n", + "\n", + "for iteration in range(n_iterations):\n", + "    logits = X_train.dot(Theta) # Logits which are raw predictions from applying X to Theta\n", + "    p_hat = softmax(logits) # Apply softmax to logits to get 
our probabilities\n", + "    loss = -np.mean(np.sum(Y_train_one_hot * np.log(p_hat + epsilon), axis=1)) # Compute loss function\n", + "    error = p_hat - Y_train_one_hot # Compute error \n", + "    if iteration % 500 == 0:\n", + "        print(iteration, loss)\n", + "    Grad = 1/m * X_train.T.dot(error)\n", + "    Theta = Theta - eta * Grad\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 3.61613128, 0.06856255, -2.86225561],\n", + " [-0.2597962 , 0.80558911, 0.70553675],\n", + " [-0.90831271, 0.18903751, 2.43558706]])" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Theta" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9666666666666667" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predictions\n", + "\n", + "logits = X_valid.dot(Theta)\n", + "p_hat = softmax(logits)\n", + "y_pred = np.argmax(p_hat, axis=1)\n", + "\n", + "accuracy_score = np.mean(y_pred == y_valid)\n", + "accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Ch4/datasets/housing/ex1data1.txt b/Ch4/datasets/housing/ex1data1.txt new file mode 100644 index 000000000..0f88ccb61 --- /dev/null +++ b/Ch4/datasets/housing/ex1data1.txt @@ -0,0 +1,97 @@ +6.1101,17.592 +5.5277,9.1302 +8.5186,13.662 +7.0032,11.854 
+5.8598,6.8233 +8.3829,11.886 +7.4764,4.3483 +8.5781,12 +6.4862,6.5987 +5.0546,3.8166 +5.7107,3.2522 +14.164,15.505 +5.734,3.1551 +8.4084,7.2258 +5.6407,0.71618 +5.3794,3.5129 +6.3654,5.3048 +5.1301,0.56077 +6.4296,3.6518 +7.0708,5.3893 +6.1891,3.1386 +20.27,21.767 +5.4901,4.263 +6.3261,5.1875 +5.5649,3.0825 +18.945,22.638 +12.828,13.501 +10.957,7.0467 +13.176,14.692 +22.203,24.147 +5.2524,-1.22 +6.5894,5.9966 +9.2482,12.134 +5.8918,1.8495 +8.2111,6.5426 +7.9334,4.5623 +8.0959,4.1164 +5.6063,3.3928 +12.836,10.117 +6.3534,5.4974 +5.4069,0.55657 +6.8825,3.9115 +11.708,5.3854 +5.7737,2.4406 +7.8247,6.7318 +7.0931,1.0463 +5.0702,5.1337 +5.8014,1.844 +11.7,8.0043 +5.5416,1.0179 +7.5402,6.7504 +5.3077,1.8396 +7.4239,4.2885 +7.6031,4.9981 +6.3328,1.4233 +6.3589,-1.4211 +6.2742,2.4756 +5.6397,4.6042 +9.3102,3.9624 +9.4536,5.4141 +8.8254,5.1694 +5.1793,-0.74279 +21.279,17.929 +14.908,12.054 +18.959,17.054 +7.2182,4.8852 +8.2951,5.7442 +10.236,7.7754 +5.4994,1.0173 +20.341,20.992 +10.136,6.6799 +7.3345,4.0259 +6.0062,1.2784 +7.2259,3.3411 +5.0269,-2.6807 +6.5479,0.29678 +7.5386,3.8845 +5.0365,5.7014 +10.274,6.7526 +5.1077,2.0576 +5.7292,0.47953 +5.1884,0.20421 +6.3557,0.67861 +9.7687,7.5435 +6.5159,5.3436 +8.5172,4.2415 +9.1802,6.7981 +6.002,0.92695 +5.5204,0.152 +5.0594,2.8214 +5.7077,1.8451 +7.6366,4.2959 +5.8707,7.2029 +5.3054,1.9869 +8.2934,0.14454 +13.394,9.0551 +5.4369,0.61705 diff --git a/Ch4/datasets/housing/ex1data2.txt b/Ch4/datasets/housing/ex1data2.txt new file mode 100644 index 000000000..79e9a807e --- /dev/null +++ b/Ch4/datasets/housing/ex1data2.txt @@ -0,0 +1,47 @@ +2104,3,399900 +1600,3,329900 +2400,3,369000 +1416,2,232000 +3000,4,539900 +1985,4,299900 +1534,3,314900 +1427,3,198999 +1380,3,212000 +1494,3,242500 +1940,4,239999 +2000,3,347000 +1890,3,329999 +4478,5,699900 +1268,3,259900 +2300,4,449900 +1320,2,299900 +1236,3,199900 +2609,4,499998 +3031,4,599000 +1767,3,252900 +1888,2,255000 +1604,3,242900 +1962,4,259900 +3890,3,573900 +1100,3,249900 
+1458,3,464500 +2526,3,469000 +2200,3,475000 +2637,3,299900 +1839,2,349900 +1000,1,169900 +2040,4,314900 +3137,3,579900 +1811,4,285900 +1437,3,249900 +1239,3,229900 +2132,4,345000 +4215,4,549000 +2162,4,287000 +1664,2,368500 +2238,3,329900 +2567,4,314000 +1200,3,299000 +852,2,179900 +1852,4,299900 +1203,3,239500 diff --git a/Ch4/exercises.ipynb b/Ch4/exercises.ipynb new file mode 100644 index 000000000..606665cc9 --- /dev/null +++ b/Ch4/exercises.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise 12**\n", + "\n", + "Implement batch gradient descent from scratch (no SKLearn!)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import os\n", + "from matplotlib import pyplot as plt\n", + "from sklearn import datasets\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iris = datasets.load_iris()\n", + "list(iris.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. 
_iris_dataset:\n", + "\n", + "Iris plants dataset\n", + "--------------------\n", + "\n", + "**Data Set Characteristics:**\n", + "\n", + " :Number of Instances: 150 (50 in each of three classes)\n", + " :Number of Attributes: 4 numeric, predictive attributes and the class\n", + " :Attribute Information:\n", + " - sepal length in cm\n", + " - sepal width in cm\n", + " - petal length in cm\n", + " - petal width in cm\n", + " - class:\n", + " - Iris-Setosa\n", + " - Iris-Versicolour\n", + " - Iris-Virginica\n", + " \n", + " :Summary Statistics:\n", + "\n", + " ============== ==== ==== ======= ===== ====================\n", + " Min Max Mean SD Class Correlation\n", + " ============== ==== ==== ======= ===== ====================\n", + " sepal length: 4.3 7.9 5.84 0.83 0.7826\n", + " sepal width: 2.0 4.4 3.05 0.43 -0.4194\n", + " petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n", + " petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n", + " ============== ==== ==== ======= ===== ====================\n", + "\n", + " :Missing Attribute Values: None\n", + " :Class Distribution: 33.3% for each of 3 classes.\n", + " :Creator: R.A. Fisher\n", + " :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n", + " :Date: July, 1988\n", + "\n", + "The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\n", + "from Fisher's paper. Note that it's the same as in R, but not as in the UCI\n", + "Machine Learning Repository, which has two wrong data points.\n", + "\n", + "This is perhaps the best known database to be found in the\n", + "pattern recognition literature. Fisher's paper is a classic in the field and\n", + "is referenced frequently to this day. (See Duda & Hart, for example.) The\n", + "data set contains 3 classes of 50 instances each, where each class refers to a\n", + "type of iris plant. One class is linearly separable from the other 2; the\n", + "latter are NOT linearly separable from each other.\n", + "\n", + ".. 
topic:: References\n", + "\n", + " - Fisher, R.A. \"The use of multiple measurements in taxonomic problems\"\n", + " Annual Eugenics, 7, Part II, 179-188 (1936); also in \"Contributions to\n", + " Mathematical Statistics\" (John Wiley, NY, 1950).\n", + " - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n", + " (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n", + " - Dasarathy, B.V. (1980) \"Nosing Around the Neighborhood: A New System\n", + " Structure and Classification Rule for Recognition in Partially Exposed\n", + " Environments\". IEEE Transactions on Pattern Analysis and Machine\n", + " Intelligence, Vol. PAMI-2, No. 1, 67-71.\n", + " - Gates, G.W. (1972) \"The Reduced Nearest Neighbor Rule\". IEEE Transactions\n", + " on Information Theory, May 1972, 431-433.\n", + " - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al\"s AUTOCLASS II\n", + " conceptual clustering system finds 3 classes in the data.\n", + " - Many, many more ...\n" + ] + } + ], + "source": [ + "print(iris.DESCR)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "X = iris[\"data\"][:, (2,3)] # petal length and width\n", + "y = (iris[\"target\"]) # integer class labels 0, 1, 2 (all three classes are kept)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(150, 2)\n" + ] + } + ], + "source": [ + "# Important variables\n", + "\n", + "X_with_bias = np.c_[np.ones([len(X), 1]), X] # Add column of ones for theta intercept term\n", + "alpha = 0.1\n", + "iterations=1500\n", + "\n", + "print(X.shape)\n", + "\n", + "# NOTE: If ValueError: all input arrays must have the same shape appears then you may have run this cell multiple times\n", + "# which will have added multiple columns of ones to the matrix X" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + 
"# Setup our proportions\n", + "\n", + "test_ratio = .2\n", + "val_ratio = .2\n", + "total_size = len(X)\n", + "\n", + "# Calculate size of our splits\n", + "\n", + "test_size = int(test_ratio*total_size)\n", + "val_size = int(val_ratio*total_size)\n", + "train_size = total_size - test_size - val_size\n", + "\n", + "# Split our data\n", + "\n", + "rnd_indices = np.random.permutation(total_size) # Random permutation of the row indices\n", + "\n", + "X_train = X_with_bias[rnd_indices[:train_size]]\n", + "y_train = y[rnd_indices[:train_size]]\n", + "X_valid = X_with_bias[rnd_indices[train_size:-test_size]]\n", + "y_valid = y[rnd_indices[train_size:-test_size]]\n", + "X_test = X_with_bias[rnd_indices[-test_size:]]\n", + "y_test = y[rnd_indices[-test_size:]]" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(90, 3)\n", + "(30, 3)\n", + "(30, 3)\n" + ] + } + ], + "source": [ + "print(X_train.shape)\n", + "print(X_valid.shape)\n", + "print(X_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "def to_one_hot(y, n_classes=None):\n", + "    n_classes = y.max() + 1 if n_classes is None else n_classes # Infer the class count from the labels unless the caller supplies it\n", + "    m = len(y)\n", + "    Y_one_hot = np.zeros((m, n_classes)) # Setup zero matrix with m rows and a column for each class\n", + "    Y_one_hot[np.arange(m), y] = 1 # Fill in ones\n", + "    return Y_one_hot" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([2, 2, 2, 0, 0, 0, 1, 2, 0, 2])" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 0., 1.],\n", + " [0., 0., 1.],\n", + " [0., 0., 1.],\n", + " [1., 0., 0.],\n", + " [1., 0., 0.],\n", + " [1., 
0., 0.],\n", + " [0., 1., 0.],\n", + " [0., 0., 1.],\n", + " [1., 0., 0.],\n", + " [0., 0., 1.]])" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to_one_hot(y_train[:10])" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "Y_train_one_hot = to_one_hot(y_train)\n", + "Y_test_one_hot = to_one_hot(y_test)\n", + "Y_val_one_hot = to_one_hot(y_valid)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "# Softmax function = exp(X) / (sum of exp(X))\n", + "\n", + "def softmax(logits):\n", + "    exps = np.exp(logits - np.max(logits, axis=1, keepdims=True)) # Shift by the row max so np.exp cannot overflow; the ratio is unchanged\n", + "    exp_sums = np.sum(exps, axis=1, keepdims=True)\n", + "    return exps / exp_sums" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "n_inputs = X_train.shape[1] # Number of input columns (features plus the bias column)\n", + "n_outputs = len(np.unique(y_train)) # 3 unique values which will each be a possible output" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 1.4567897105648775\n", + "500 0.7451993577978241\n", + "1000 0.6279369677273878\n", + "1500 0.5572702696067121\n", + "2000 0.5111859948576022\n", + "2500 0.47856473219026296\n", + "3000 0.45387932862540925\n", + "3500 0.43422780377165426\n", + "4000 0.41797875623202274\n", + "4500 0.4041537521442775\n", + "5000 0.39213163561158126\n" + ] + } + ], + "source": [ + "eta = 0.01\n", + "n_iterations = 5001\n", + "m = len(X_train)\n", + "epsilon = 1e-7\n", + "\n", + "Theta = np.random.randn(n_inputs, n_outputs)\n", + "\n", + "# Cycle through set to apply batch gradient descent\n", + "\n", + "for iteration in range(n_iterations):\n", + "    logits = X_train.dot(Theta) # Logits which are raw predictions from applying X to Theta\n", + "    p_hat = softmax(logits) # Apply softmax to logits to get 
our probabilities\n", + "    loss = -np.mean(np.sum(Y_train_one_hot * np.log(p_hat + epsilon), axis=1)) # Compute loss function\n", + "    error = p_hat - Y_train_one_hot # Compute error \n", + "    if iteration % 500 == 0:\n", + "        print(iteration, loss)\n", + "    Grad = 1/m * X_train.T.dot(error)\n", + "    Theta = Theta - eta * Grad\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 3.61613128, 0.06856255, -2.86225561],\n", + " [-0.2597962 , 0.80558911, 0.70553675],\n", + " [-0.90831271, 0.18903751, 2.43558706]])" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Theta" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9666666666666667" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Predictions\n", + "\n", + "logits = X_valid.dot(Theta)\n", + "p_hat = softmax(logits)\n", + "y_pred = np.argmax(p_hat, axis=1)\n", + "\n", + "accuracy_score = np.mean(y_pred == y_valid)\n", + "accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Ch5/.ipynb_checkpoints/Exercises-checkpoint.ipynb b/Ch5/.ipynb_checkpoints/Exercises-checkpoint.ipynb new file mode 100644 index 000000000..bc6125423 --- /dev/null +++ b/Ch5/.ipynb_checkpoints/Exercises-checkpoint.ipynb @@ -0,0 +1,657 @@ +{ + 
"cells": [ + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "import os\n", + "\n", + "from sklearn.svm import SVC\n", + "from sklearn import datasets\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVC\n", + "from sklearn import datasets\n", + "\n", + "iris = datasets.load_iris()\n", + "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", + "y = iris[\"target\"]\n", + "\n", + "setosa_or_versicolor = (y == 0) | (y == 1)\n", + "X = X[setosa_or_versicolor]\n", + "y = y[setosa_or_versicolor]" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the data (the X and y defaults are bound when this cell runs, so re-run it after changing them)\n", + "\n", + "def plot_setosa_versicolor(X = X, y = y):\n", + "    plt.plot(X[:,0][y==0], X[:,1][y==0], 'bo', label='iris setosa')\n", + "    plt.plot(X[:,0][y==1], X[:,1][y==1], 'r^', label='iris versicolor')\n", + "\n", + "    plt.xlabel('petal length', fontsize=15)\n", + "    plt.ylabel('petal width', fontsize=15)\n", + "    plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAY0AAAELCAYAAAAlTtoUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAgAElEQVR4nO3deZQV9Z338feXBoYlGCPgBkrrPJqRtYUGxQVFoiGKgysuaMJEQwA540yeiWGSieKSeWLMM2ayaXABF0YneVzGxGggKEEUIy02CogMKq0dSGgbRVEYBL7PH1UXbl/uUn3X6u7P65w6fat+tXxv9Tn97V/9qr5l7o6IiEgUnSodgIiItB1KGiIiEpmShoiIRKakISIikSlpiIhIZJ0rHUCp9enTx6urqysdhohIm/Hyyy+/5+5907W1+6RRXV1NXV1dpcMQEWkzzKwhU5suT4mISGRKGiIiEpmShoiIRNbuxzTS+fTTT2lsbGTHjh2VDkWAbt260b9/f7p06VLpUEQkhw6ZNBobG+nVqxfV1dWYWaXD6dDcnebmZhobGznqqKMqHY6I5NAhL0/t2LGD3r17K2HEgJnRu3dv9fqkY9q0CU47Df7850pHElmHTBqAEkaM6HchHdbNN8PSpcHPNqLDJg0RkYratAnmzoU9e4KfbaS3oaRRISeddFJebYVYvHgxL7zwQkn2LSKtdPPNQcIA2L27zfQ2lDQimD8fqquhU6fg5/z5he8z3R/v3bt3Z2wrBiUNkZhI9DJ27gzmd+5sM70NJY0c5s+HqVOhoQHcg59TpxaeOD7zmc8AwR/ysWPHcvnllzNkyJAWbZs2bWLMmDHU1NQwePBgnnvuuf32M2vWLAYOHMjQoUP5p3/6JwCampq48MILGTlyJCNHjuT5559nw4YN3Hnnndx+++3U1NTw3HPP0dDQwLhx4xg6dCjjxo3jnXfeAeBXv/oVgwcPZtiwYYwZMwaADRs2cOqppzJ8+HCGDx+u5CNSiOReRkJb6W24e9km4F5gM7AqQ/s3gfpwWgXsBg4K2zYAr4VtdVGPOWLECE+1Zs2a/ZZlMmCAe5AuWk4DBkTeRVo9e/Z0d/dnn33We/To4W+99dZ+bT/84Q/9lltucXf3Xbt2+YcffthiH83NzX7sscf6nj173N39/fffd3f3yy67zJ977jl3d29oaPC/+Zu/cXf3G264wW+77ba920+YMMHnzZvn7u733HOPT5w40d3dBw8e7I2NjS32+fHHH/v27dvd3X3dunWe7rwWojW/E5E2r6Ym/R+WmppKR+bu7tn+xpb7OY15wE+B+9M1uvttwG0AZnYu8I/uviVplbHu/l6pg0wW/vMdeXk+Ro0alfYZhZEjR/LVr36VTz/9lPPOO4+ampoW7QcccADdunXj6quv5pxzzmHChAkA/P73v2fNmjV71/vwww/56KOP9tv/smXLePTRRwG48sorue666wA4+eSTmTJlCpMmTeKCCy4AggciZ86cSX19PVVVVaxbt644X16kI3rllUpHkLeyXp5y9yXAlpwrBi4DHiphOJEceWTrluejZ8+eaZePGTOGJUuW0K9fP6688kruv79lru3cuTMvvfQSF154IY8//jjjx48HYM+ePSxbtoz6+nrq6+v505/+RK9evXLGkbj19c477+SWW27h3XffpaamhubmZm6//XYOOeQQVq5cSV1dHTsT12JFpEOJ5ZiGmfUAxgOPJC12YIGZvWxmU3NsP9XM6sysrqmpqaBYvvc96NGj5bIePYLlpdbQ0MDBBx/M1772Na666ipWrFjRon3btm1s3bqVs88+mx/96EfU19cDcNZZZ/HTn/5073qJ5b169WrR4zjppJN4+OGHAZg/fz6nnHIKAG+++SYnnHACN910E3369OHdd99l69atHHbYYXTq1IkHHnhg76C9iHQwma5blWoCqskwppG0ziXAr1OWHR7+PBhYCYyJcrx
CxzTc3R98MBjDMAt+PvhgqzZPK3lM45xzzknbNm/ePB80aJDX1NT4Kaec0mLcw91948aNPnLkSB8yZIgPHjx47/hEU1OTT5o0yYcMGeLHHXecf/3rX3d39zfeeMOHDBniw4YN8yVLlvjbb7/tY8eO9SFDhvgZZ5zhDQ0N7u5+/vnn++DBg33QoEH+93//975nzx5ft26dDxkyxE844QSfNWvW3hiLRWMaIvFBljENC9rLx8yqgd+4++As6zwG/Mrd/yND+2xgm7v/MNfxamtrPfUlTK+//jrHHXdcK6KWUtPvRCQ+zOxld69N1xa7y1Nm9lngNOC/kpb1NLNeic/AWQR3V4mIdEwVqltV1qRhZg8By4DPm1mjmV1lZtPMbFrSaucDC9z946RlhwBLzWwl8BLwpLs/Xb7IRURipkJ1q8p6y627XxZhnXkEt+YmL3sLGFaaqERE2pjUulXf/S4cemhZDh27y1MiIpJDBetWKWmIiLQlFa5bpaQhItKWVLhulZJGhVSiNHo68+bNY+PGjXvnr7766hYlSFqzn5kzZxYzNBFJZ9myfb2MhJ07oUxFRJU0oiry7W2VKI2e7nipSePuu+9m4MCBJT/2rl27Sn4MkXbplVfSlTosWz0rJY2oinx7WzFKoz/11FNMmjRp7/zixYs599xzAViwYAGjR49m+PDhXHzxxWzbtg2A6upqbrrpJk455RQeeugh6urqmDx5MjU1NWzfvp3TTz+dxMOQTz/9NMOHD2fYsGGMGzcOgC1btnDeeecxdOhQTjzxRF599dX9vlumkutTpkzhG9/4BmPHjuVb3/pWUc6jiJRZpkfF28tUjDIivnGje7duQT7v3t1906bWbZ9GMUqjf/rpp37EEUf4tm3b3N192rRp/sADD3hTU5Ofeuqpe5d///vf9xtvvNHd3QcMGOC33nrr3n2cdtppvnz58v3mN2/e7P37998bV3Nzs7u7z5w502fPnu3u7osWLfJhw4a5u/vcuXP9mmuucffMJde/8pWv+DnnnOO7du3a73yojIhIfJCljIh6GlGU+Pa2bKXR586dy+zZs3nttdf2q1TbuXNnxo8fz69//Wt27drFk08+ycSJE3nxxRdZs2YNJ598MjU1Ndx33300NDTs3e6SSy7JGdOLL77ImDFj9sZ10EEHAbB06VKuvPJKAM444wyam5vZunVri22XLVvG5ZdfDgQl15cuXbq37eKLL6aqqirKaRGRGFLSyKUMt7flWxodggTwy1/+kmeeeYaRI0fSq1cv3J0zzzxzb2n0NWvWcM899+Q8XjJ331sqPXV5qnTrZWqPcmyR2Ml3TLNCpT5KeVwljVwqeHtbrtLoAKeffjorVqzgrrvu2tuDOPHEE3n++edZv349AJ988knGlyallktPGD16NH/4wx94++23gWAsA4JENj981+3ixYvp06cPBxxwQIttM5VcF2mz8h3TrFCpj1IeV0kjlwre3rZ48WJqamo4/vjjeeSRR7j22mv3W6eqqooJEybw1FNP7X1zX9++fZk3bx6XXXbZ3gHrtWvXpj3GlClTmDZt2t6B8IS+ffsyZ84cLrjgAoYNG7Y3Ic2ePZu6ujqGDh3KrFmzuO+++/bb549//GPmzp3L0KFDeeCBB/j3f//3YpwOkcpILdkR9b/3fLcrVKmPm2mwo71MRRkIl5LT70Ria/p0965dgxthunZ1nzGjtNsVqgjHRQPhIiJ5yHdMs1KlPspwXCUNEZFM8h3TrNRYaBmO22GThqe5C0gqQ78Lia18xzQrNRZahuOW9X0acdGtWzeam5vp3bt3zttFpbTcnebmZrp161bpUET2l29pjjKV9KjEcTtk0ujfvz+NjY00NTVVOhQhSOL9+/evdBgiEkGHTBpdunRJ+wS2iIhk12HHNEREpPWUNEREJLKyJg0zu9fMNpvZqgztp5vZVjOrD6frk9rGm9kbZrbezGaVL2oRaRNKWefpjjvADO66q3XHLSS
m+no48EBI8/qBSip3T2MeMD7HOs+5e0043QRgZlXAz4AvAQOBy8ys9G8KEpG2o5R1nhJvpZw2rXXHLSSmK66ArVshrBgdF2VNGu6+BNiSx6ajgPXu/pa77wQeBiYWNTgRabtKWW/pjjv2PTC3Z0/L3ka24xYSU309rF4dfF69Ola9jTiOaYw2s5Vm9pSZDQqX9QPeTVqnMVyWlplNNbM6M6vTbbUiHUAp33mT6GUkJPc2sh23kJiuuKLlfIx6G3FLGiuAAe4+DPgJ8Hi4PN0TeBkfI3b3Oe5e6+61ffv2LUGYIhIbpay3lNzLSEj0NrIdt5CYknsZCTHqbcQqabj7h+6+Lfz8W6CLmfUh6FkckbRqf2BjBUIUkbgpZb2l1F5GwrRp2Y9bSEypvYyEmPQ2YvVwn5kdCvzF3d3MRhEktWbgA+AYMzsK+BNwKRCPMygilVXKekupf/iTl+c6br4xvflm65aXWVmThpk9BJwO9DGzRuAGoAuAu98JXARMN7NdwHbg0rC2+y4zmwn8DqgC7nX31WkOISIdTSnrLVWimGbSy9DiyNp7hdHa2lqvq6urdBgiIm2Gmb3s7rXp2mI1piEiIvGmpCEiIpEpaYiISGRKGiJSXqWsEZWvQuo8Zfs+ufYbx3ORg5KGiJRXKWtE5auQOk/Zvk+u/cbxXOSgu6dEpHw2bYKjj4YdO6B7d3jrLTj00MrGVF8Pxx+/b37lShg6NNq22b5Prv3G8VyEdPeUiMRDKWtE5auQOk/Zvk+u/cbxXESgnoaIlEfyf9YJlf4PO7U3kBClt5Ht+/z5z9n3G8dzkUQ9DRGpvFLWiMpXIXWesn2fXPuN47mISElDRMqjlDWi8lVInads3yfXfuN4LiKKVcFCEWnHSlkjKl+F1Hkq5PvE8VxEpJ6GiIhEpqQhIiKRKWmIiEhkShoikl4lSlwsXAidO8Mzz6Rvz1aWI1fJjmztub5rtvY2WAqkIO7erqcRI0a4iORh+nT3Tp3cZ8wo3zE/9zl3CH6mM2hQ0D5oUOvacrXn+q7Z2itxnkoMqPMMf1Mr/ke91JOShkgeNm5079Yt+BPRvbv7pk2lP+aCBcHxEtOiRS3bX3mlZfvKldHacrXn+q7Z2itxnsogW9LQ5SkR2V8lSlxccknL+YsuajmfrSxHrpId2dpzfdds7W20FEghWlVGxMyOBfoD3VLb3P23RYyraFRGRKSVKlHiYuFCOOus/ZcvWgRnnJG93MeePdlLdmTbtm/f7N8127lwj3UpkEJkKyMS6eE+MxsI/CcwELA0qzhQlXeEIhIf2Upc/OxnpTlmai8j4aKLYMuW/Mp9XH45rFqVfdsxY7J/12znwr385ykGoj4R/gugK3ABsAbYmX319MzsXmACsNndB6dpnwx8K5zdBkx395Vh2wbgI2A3sCtTFhSRAlWixMUHH2Rfnk+5j0Rbtm27dMn+XXOdizZaCqQQUZPG8cCl7v6bAo83D/gpcH+G9reB09z9fTP7EjAHOCGpfay7v1dgDCKSTSVKXKT+x56qkHIflSoV0k5FTRpvkmYco7XcfYmZVWdpT07RLxKMn4iISExEvXvqfwPfNrOjSxlMiquAp5LmHVhgZi+b2dRsG5rZVDOrM7O6pqamkgYpItKRZOxpmNlygj/UCf2AteHYwn4XIN19VLGCMrOxBEnjlKTFJ7v7RjM7GFhoZmvdfUm67d19DsGlLWpra9v3W6ZERMoo2+Wp1bRMGqtLHAsAZjYUuBv4krs3J5a7+8bw52YzewwYBaRNGiIiUhoZL0+5+xR3/7uoUzGCMbMjgUeBK919XdLynmbWK/EZOAtYVYxjikiZlaqOU6W2rcR+KyjSmIaZ3WtmR2VoGxDeShtlPw8By4DPm1mjmV1lZtPMbFq4yvVAb+DnZlZvZomn8g4BlprZSuAl4El3fzrKMUUkZm6+GZYuTf/0dLa2QvZbym0rsd8KivREuJntAU5095fStI0AXnL3WD7cpyf
CRWIk+Qnr1Kens7UVst9SbluJ/ZZBtifCW1N7KlN2GQzoFiURya1UdZwqtW0l9lthGXsaZnYtcG04OwD4M/A/Kat1I7h0NM/drypVkIVQT0MkJkpVx6mQWlmlqrNVifpdRZRvT2MN8AjBwLQBz4bzydNcYAowo4jxikh7lK2OU7a2QvZbym0rsd8YyHjLrbsvBBYCmNlHwN3u/qdyBSYi7Uyp6jgVUiurVHW2KlG/q0xaVRq9LdLlKRGR1smrNLqZZXhJb3rufkZrAxMRkbYl25hGc8p0LHAq0IOgbHkPgjIfxwCqPCsi0gFkG9O4OPHZzK4CPg+c5O7vJC0/EvgN4diHiIi0b1Gf0/gOcH1ywgAI528Avl3swEREJH6iJo1Dgb/K0PZXwMHFCUeknWqHNYiyKlV9Kam4qEljMXCrmbUYTTezkcCtwB+KHJdI+9IOaxBlVar6UlJxUWtP9QeeAIYBfwE2E/QuDgFeBc5198YSxpk33XIrFdeGaxDlpVT1paRsCq495e6N7j4cOBf4BUGl2l8QJIvj45owRGKhndYgyqhU9aUkFvRwn0gptfEaRK1WqvpSUlZ59TTMrEfy51xTKQIXafPacQ2itEpVX0piI9vrXj8ys9HhOzS2kbk0ekIs36chUlHtuAZRWqWqLyWxkS1pfBV4M+lz+76OJVIKr7xS6QjKq6N93w4o2xPh9yV9nleWaEREJNaiviP8ZjM708w+U+qAREQkvqI+3Hce8DTwvpnVmdntZnaBmelJcBGRDiTqcxpDgD7AhQRv8BsNPAxsMrO1ZnZXlP2Y2b1mttnMVmVoNzP7sZmtN7NXzWx4Utt4M3sjbJsV5XgiHUJ9PRx4ILz6anm2g9KVAsm1X5UgqTx3b/UEdAXOJigvsgfYHXG7McBwYFWG9rOBpwheL3si8MdweRXBoPzR4bFXAgOjHHPEiBEu0q4NGuQOwc9ybOfuPn26e6dO7jNmtH7bQvZbquNKC0CdZ/ibGnVM4wAz+5KZ/auZPQdsBe4DPgC+SdDziJKglgBbsqwyEbg/jPtF4EAzOwwYBax397fcfSdBL2dilGOKtGv19bB6dfB59erovYZ8t4Pgv/25c4NnLubOLd5//bn2W6rjSqtEHdPYAvw/oBp4EKh1977ufp67/18PnuUohn7Au0nzjeGyTMvTMrOp4dhLXVNTU5FCE4mhK65oOX/55aXdDkpXCiTXflWCJBaiJo3lBLfnngmcBZxpZsebmRU5nnT78yzL03L3Oe5e6+61ffv2LVpwIrGS3FtIiNJryHc72PfffuIhvZ07i/Nff679luq40mpRB8JHAwcCkwiq2k4AniO4m+q3ZvatIsXTCByRNN8f2JhluUjHldpbSMjVa8h3OyhdKZBc+1UJktiI2tPA3be7+7PufiNwATAZqAfGA/9apHieAL4c3kV1IrDV3TcR9HSOMbOjzKwrcGm4rkjH9eabrVte6HZQurIoufbb0cqxxFi2MiJ7mdmhwKlJ02CCS0argZ8R9Dqi7Och4HSgj5k1ErwqtguAu98J/JbgDqr1wCfA34Vtu8xsJvA7gjup7nX31fsdQKQj2b69vNtB6cqE5NqvypPERtSXMO0BdgIrCBLEEuB5d/+gtOEVTqXRRURaJ1tp9Eg9DWAcsMzdd+RcU0RE2q1IScPdny11ICIiEn+RB8JFRESUNEREJDIlDRERiUxJQ0REIlPSEBGRyDLePWVmy2nFe8HdfVRRIhIRkdjKdsvtalqRNEREpP3LmDTcfUoZ4xARkTZAYxoiIhJZ1DIimFk1cAVwLNAttd3dJxUtKhERiaWoVW5HAH8geHvesQTv1PgswZv8Ggmq0oqISDsX9fLUbcAj7CuJfpW7Hw2cQjBY/oPShCciInESNWnUAP8BJF6d1Q3A3V8AbgS+X/zQREQkbqImDQd2evDyjc3AgKS2d4Fjih2YiIjET9SksQb46/DzMuAfzewYMxsAXAdEeE+kiIi0dVH
vnprDvt7Ft4EFwNpw/mPgoiLHJSIiMRT1JUwPJH1+3cyOA0YD3YEX3X1zieITEZEYiXR5ysy+bGa9E/Puvs3dF7r7E8AuM/tyySIUEZHYiDqmMZd9YxqpjgrbIzGz8Wb2hpmtN7NZadq/aWb14bTKzHab2UFh2wYzey1sq4t6TBERKY6oYxqWpa038GGknZhVAT8DziR4KHC5mT3h7msS67j7bQTPhWBm5wL/6O5bknYz1t3fixi3iIgUUbbS6BOBiUmLvmtmTSmrdQNOBZZHPN4oYL27vxUe4+HwGGsyrH8Z8FDEfYuISIll62kcDAxJmv9r4NCUdXYS3El1S8Tj9SN4riOhETgh3Ypm1gMYD8xMWuzAAjNz4BfuPifDtlOBqQBHHnlkxNBERCSXbKXR7wLuAjCzZ4EZ7v56gcdLd5kr0zs7zgWeT7k0dbK7bzSzg4GFZrbW3ZekiX0OwW3C1NbW6p0gIiJFEmkg3N3HJhKGBQ43s8gVcpM0AkckzfcHNmZY91JSLk25+8bw52bgMYLLXSIiUiaR36dhZmeb2R+BHQSXmIaGy+8ysysi7mY5cIyZHWVmXQkSwxNpjvVZ4DTgv5KW9TSzXonPwFnAqqjxi4hI4SI/p0Hwx30twVhB8mWmdcBVUfbj7rsIxih+B7wO/NLdV5vZNDOblrTq+cACd/84adkhwFIzWwm8BDzp7k9HOa6IiBSHBTUIc6xk9gbwqLv/c3jb7KdArbuvMLOzgbnufkiJY81LbW2t19XpkQ4RkajM7GV3r03XFvXy1ABgYYa2HcAB+QQmIiJtS9Sk8S5wfIa2WvTmPhGRDiFq0rgHuCEc8O4eLjMzG0dQGv2uUgQnIiLxEvW22VsJbpW9D9gdLnsBqCJ4yO7HJYhNRERiJmppdAeuMbPbgXEE9aa2AM+4+7oSxiciIjHSqgf03H09Gr8QEemwIieN8GG8KQRPYR8GbAL+CNzn7jtLEp2IiMRK1If7jgP+m6Cs+WCCcY3B4fx6MxtYsghFRCQ2WvOO8K3Aqe7+TmKhmR0JPAncCYwpfngiIhInUW+5rQWuT04YAOH89cDIYgcm0c2fD9XV0KlT8HP+/EpHJCLtVdSexgaCFy6l0w14J0OblNj8+TB1KnzySTDf0BDMA0yeXLm4RKR9itrTmAXcYmYtXphkZicCNwHfKnZgEs13vrMvYSR88kmwXESk2KL2NP6FoL7UC2a2GdhM8Ga/g4Fm4Ntm9u3Eyu6u91yUyTsZ+niZlouIFCJq0liF3l0RS0ceGVySSrdcRKTYoj4R/nelDkTy873vtRzTAOjRI1guIlJskd/cJ/E0eTLMmQMDBoBZ8HPOHA2Ci0hp5POeb4mZyZOVJESkPNTTEBGRyJQ0REQkMiUNERGJrOxJw8zGm9kbZrbezGalaT/dzLaaWX04XR91W0lPZUZEpFjKOhBuZlUElXHPBBqB5Wb2hLuvSVn1OXefkOe2kkRlRkSkmMrd0xgFrHf3t8J3cDwMTCzDth2WyoyISDGVO2n0A95Nmm8Ml6UabWYrzewpMxvUym0xs6lmVmdmdU1NTcWIu81SmRERKaZyJw1Ls8xT5lcAA9x9GPAT4PFWbBssdJ/j7rXuXtu3b9+8g20PMpUTUZkREclHuZNGI3BE0nx/YGPyCu7+obtvCz//FuhiZn2ibCv7+973grIiyVRmRETyVe6ksRw4xsyOCt85finwRPIKZnaomVn4eVQYY3OUbWV/KjMiIsVU1run3H2Xmc0EfgdUAfe6+2ozmxa23wlcBEw3s13AduBSd3cg7bbljL+tUpkRESkWC/4et1+1tbVeV1dX6TBERNoMM3vZ3WvTtemJcBERiUxJQ0REIlPSEBGRyJQ0yqiQGlBf+EJw91Ni+sIXou+3kOOqbpWItODu7XoaMWKEx8GDD7r36OEO+6YePYLluYwb13K7xDRuXO79FnLcQrYVkbYLqPMMf1N191SZVFcHxQJTDRgAGzZk39b
SPQuftH22/RZy3EK2FZG2K9vdU0oaZdKpU/C/eioz2LMn+7bZkoZZ9v0WctxCthWRtku33MZAqWpA5dpvIcdV3SoRSaWkUSaF1IAaNy7z8lz7LeS4qlslIvvJNNjRXqa4DIS7BwPIAwa4mwU/WzOgnDoYPm5c9P0WctxCthWRtgkNhFd+TENEpK3QmIaIiBSFkoaIiESmpCEiIpEpaYiISGRKGmU0YwZ07hw8HNe5czCfkK22FKh+lIjEQ1nf3NeRzZgBd9yxb3737n3z69bBokUt11+0KEgcv/998Ed+6lT45JOgraEhmIfcb+QrZFsRkVS65bZMOncOEkWqqqr0yxPcVT9KRMpLt9zGQKbEkC1hJLzzTuuWF2tbEZFUShplUlXVuuXJVD9KROKi7EnDzMab2Rtmtt7MZqVpn2xmr4bTC2Y2LKltg5m9Zmb1Zlb5a06tkBhHSLc8W20pUP0oEYmRTPVFSjEBVcCbwNFAV2AlMDBlnZOAz4WfvwT8MaltA9CnNceMU+2p6dPdq6qC2lFVVcF8QrbaUu6qHyUi5UNcak+Z2Whgtrt/MZz/5zBx/Z8M638OWOXu/cL5DUCtu78X9ZhxGQgXEWkr4jQQ3g94N2m+MVyWyVXAU0nzDiwws5fNLMMFHzCzqWZWZ2Z1TU1NBQUsIiL7lPs5jXTvoEvb1TGzsQRJ45SkxSe7+0YzOxhYaGZr3X3Jfjt0nwPMgaCnUXjYIiIC5e9pNAJHJM33BzamrmRmQ4G7gYnu3pxY7u4bw5+bgceAUSWNVkREWih30lgOHGNmR5lZV+BS4InkFczsSOBR4Ep3X5e0vKeZ9Up8Bs4CVpUiyELKbmQrFTJoUMtSIYMG7Wvr2rVlW9euLffbo0fL9uQ7ovr1a9nWL+WCn0qQiEjRZBohL9UEnA2sI7iL6jvhsmnAtPDz3cD7QH041YXLjya422olsDqxba6ptXdPPfige48eLe9k6tEj2h1H06e33C4xTZ/uPnBg+raBA927dEnf1qVLsN/u3dO3d+/ufvjh6dsOP7zw71PItiLSdhGXu6cqobV3TxVSdiPfUiHZuAe9h3y3VQkSEWmtON09FXuFlN0opFRIqagEiYgUk5xViZUAAAhDSURBVJJGikLKbhRSKqRUVIJERIpJSSNFIWU3spUKGTgwfdvAgdClS/q2xPLu3dO3d+8Ohx+evi2xXCVIRKSoMg12tJcpnzIihZTdyFYqJHUwfODAfW2pg+GJQfCE1MHw7t33taUOhicGwYvxfVSCRKTjQQPhKiMiIhKVBsJFRKQolDRERCQyJQ0REYlMSUNERCJT0milUtViylazKkq7iEg5lLs0eps2f37wzMUnnwTzDQ37ns2YPDn//c6YAXfcsW9+9+598z//ee52EZFy0S23rVCqWkzZalbt2pW7XUSkmHTLbZGUqhZTrppVcaxpJSIdk5JGK5SqFlOumlVxrGklIh2TkkYrlKoWU7aaVVHaRUTKRUmjFSZPhjlzgjEMs+DnnDmFDYJDMJg9fXrLnsX06fsGuXO1i4iUiwbCRUSkBQ2Ei4hIUShpiIhIZEoaIiISmZKGiIhEpqQhIiKRtfu7p8ysCUhT/COSPsB7RQynvdJ5ikbnKRqdp+hKda4GuHvfdA3tPmkUwszqMt12JvvoPEWj8xSNzlN0lThXujwlIiKRKWmIiEhkShrZzal0AG2EzlM0Ok/R6DxFV/ZzpTENERGJTD0NERGJTElDREQiU9JIw8zuNbPNZraq0rHEmZkdYWbPmtnrZrbazK6tdExxZGbdzOwlM1sZnqcbKx1TnJlZlZm9Yma/qXQscWVmG8zsNTOrN7OylvHWmEYaZjYG2Abc7+6DKx1PXJnZYcBh7r7CzHoBLwPnufuaCocWK2ZmQE9332ZmXYClwLXu/mKFQ4slM/sGUAsc4O4TKh1PHJnZBqDW3cv+EKR6Gmm4+xJgS6XjiDt33+TuK8LPHwGvA/0qG1X8eGBbONs
lnPTfWhpm1h84B7i70rFIekoaUhRmVg0cD/yxspHEU3jJpR7YDCx0d52n9H4EXAfsqXQgMefAAjN72czK+uJnJQ0pmJl9BngE+Ad3/7DS8cSRu+929xqgPzDKzHTZM4WZTQA2u/vLlY6lDTjZ3YcDXwKuCS+pl4WShhQkvEb/CDDf3R+tdDxx5+4fAIuB8RUOJY5OBv42vF7/MHCGmT1Y2ZDiyd03hj83A48Bo8p1bCUNyVs4wHsP8Lq7/1ul44krM+trZgeGn7sDXwDWVjaq+HH3f3b3/u5eDVwKPOPuV1Q4rNgxs57hjSeYWU/gLKBsd3oqaaRhZg8By4DPm1mjmV1V6Zhi6mTgSoL/COvD6exKBxVDhwHPmtmrwHKCMQ3dTir5OgRYamYrgZeAJ9396XIdXLfciohIZOppiIhIZEoaIiISmZKGiIhEpqQhIiKRKWmIiEhkShoiKcxsqpmdl+e283JVHY2yTqmZ2XVmdnqa5W5mMysQkrQRShoi+5sK5JU02pDrgNMrHYS0PUoaIiISmZKGtAuJSz5mdp6ZrTWzHWa21MwGpqzXycxmmdl6M/sfM1tnZl9Jal8MjAC+El6qcTObErZ9OdznFjN7P3wBVW2R4j/SzB4O9/2Jmf3OzD6f1F4dxjLJzH5hZlvDagU3mlmnlH1dbGb/bWbbwxiPT/keG4DewA1J3/H0pF1Umdm/mllT+DKyn5nZXxXje0rbp6Qh7ckA4N+Am4HLgc8CvzOzbknr/AT4F2AOwXsbHgPuDSusAswgqAv1W2B0OD0ZtlUD9wMXh/tvBJaY2dGFBG1mBxG8mOnzwDRgEtAT+H1YqyrZDwheEHYR8CBwffg5sa9agmJ/K4DzgSeA/0zZx/nAVoK6YYnvuCKp/X8DhwNXALcBXwf0VkYJuLsmTW1+AuYRvGPgpKRlA4BdwLRw/n8RvKfhKynb3g8sT5qvA+blOF4noDNBgrk+JY66CLHWJc3fDDQDByUt+xzBH/Zrwvnq8Pvdn7KveuDhpPlfERSvs6Rl14XbTkla9h4wO01sDixJWfY48GKlf8ea4jGppyHtyWZ3fyEx4+4NBK+gTZSNHkeQNB4zs86JCVgE1JhZVbadm9lxZvaYmf0F2A18StA7OLbAuL8ALAQ+TIrpozD21MtfC1Lm1xC8oyNhJPBrd08uKvdEK+PJdQzpwDpXOgCRItqcYdlh4ec+QBXBf/DpHEZwyWk/YSnqBcBfgG8ADcAOgteSdku3TSv0AU4ELknTtihl/oOU+Z0pxz8UaEpZJ3U+l1zHkA5MSUPak4MzLFsdft5CcLnqZNK/TjRd0kkYTfDf9pnuvvddGGb22fxCbWELQW/g5jRtH7VyX38G+qYsS50XyZuShrQnB5vZSYlLVGZ2JDAcmBu2P0PQ0/isuy/Msp90/1knBqT/J7HAzE4iGGso9PWkiwgGv1e7+/YC97UcONfMvp10iepv06yn3oPkRUlD2pP3gAfM7LvAduAmgt7DPAB3f8PM7gQeNrMfEAx4dwMGAce6+9XhftYCXzSzLxIMUL8NvEhw19Jd4bb9gdnAn4oQ978R3Kn0jJn9JNznIcBpwFJ3f6gV+7oV+CPBd5wLHAd8LWxL7l2tBc4xs6cJvtcb7t7aXo10QBoIl/akAfgmwR/zh4EPgS+6+46kda4huAz0ZYLbaucR3Hq7JGmdW4DXgV8S/ufu7n8huNX2UOC/gH8guD12faFBu/t7BGMaa4HbCcZOfkBwy/CrrdxXHXAZwbMmjwMXAtPD5g+TVv0m8DHB7cTLw/VFctKb+6RdMLN5wGB3L8rDdu2JmV0BPAAc7e5vVzoeadt0eUqknTGzOwhu4X2fYEznXwjeI62EIQVT0hBpf3oDPw9/NhM8EX5dRSOSdkOXp0REJDINhIuISGRKGiIiEpmShoiIRKakISIikSlpiIhIZP8fXS2o+OOWKW0AAAAASUVORK5CYII=\n", + "text/plain": [ 
+ "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_setosa_versicolor()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**EXERCISE 8**\n", + "\n", + "Train LinearSVC on linearly separable data, then train SVC and SGDClassifier and compare" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_decision_boundary(pipeline, X=X, y=y):\n", + " # Get params for our decision boundary\n", + " transfX = pipeline['scaler'].fit_transform(X)\n", + "\n", + " Xmin = transfX.min()\n", + " Xmax = transfX.max()\n", + "\n", + " b = pipeline['clf'].intercept_\n", + " w0 = pipeline['clf'].coef_[0][0]\n", + " w1 = pipeline['clf'].coef_[0][1]\n", + "\n", + " # get our input values to build line\n", + " x0 = np.linspace(Xmin, Xmax, 200)\n", + "\n", + " # Setup boundary\n", + " # b + w0x + w1y = 0 ==> y = -w0/w1 * x - b/w1\n", + " boundary = -(w0/w1) * x0 - b/w1\n", + " margin = 1/w1\n", + " top_gutter = boundary + margin\n", + " bot_gutter = boundary - margin\n", + "\n", + " # Plot our boundary and gutters\n", + " plt.plot(x0, boundary, 'k-')\n", + " plt.plot(x0, top_gutter, 'r--')\n", + " plt.plot(x0, bot_gutter, 'b--')" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Preprocess the data\n", + "\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.svm import LinearSVC\n", + "\n", + "Linear_SVM_clf = Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('clf', LinearSVC()) \n", + "])\n", + "\n", + "Linear_SVM_clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", 
+ "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_setosa_versicolor(X = transfX)\n", + "plot_decision_boundary(Linear_SVM_clf)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('scaler',\n", + " StandardScaler(copy=True, with_mean=True, with_std=True)),\n", + " ('clf',\n", + " SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,\n", + " coef0=0.0, decision_function_shape='ovr', degree=3,\n", + " gamma='scale', kernel='linear', max_iter=-1,\n", + " probability=False, random_state=None, shrinking=True,\n", + " tol=0.001, verbose=False))],\n", + " verbose=False)" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.svm import SVC\n", + "\n", + "SVC_clf = Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('clf', SVC(kernel='linear')) \n", + "])\n", + "\n", + "SVC_clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_setosa_versicolor(X = transfX)\n", + "plot_decision_boundary(SVC_clf)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('scaler',\n", + " StandardScaler(copy=True, with_mean=True, with_std=True)),\n", + " ('clf',\n", + " SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", + " early_stopping=False, epsilon=0.1, eta0=0.001,\n", + " fit_intercept=True, l1_ratio=0.15,\n", + " learning_rate='constant', loss='hinge',\n", + " max_iter=1000, n_iter_no_change=5, n_jobs=None,\n", + " penalty='l2', power_t=0.5, random_state=42,\n", + " shuffle=True, tol=0.001, validation_fraction=0.1,\n", + " verbose=0, warm_start=False))],\n", + " verbose=False)" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import SGDClassifier\n", + "\n", + "sgd_clf = Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('clf', SGDClassifier(loss=\"hinge\", learning_rate=\"constant\", eta0=0.001,\n", + " max_iter=1000, tol=1e-3, random_state=42)) \n", + "])\n", + "\n", + "sgd_clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_setosa_versicolor(X = transfX)\n", + "plot_decision_boundary(sgd_clf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**EXERCISE 10**\n", + "\n", + "Train SVR on California Housing Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_california_housing\n", + "\n", + "housing = fetch_california_housing()\n", + "X = housing[\"data\"]\n", + "y = housing[\"target\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(20640, 8)" + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('scaler',\n", + " StandardScaler(copy=True, with_mean=True, with_std=True)),\n", + " ('model',\n", + " SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,\n", + " gamma='scale', kernel='rbf', max_iter=-1, shrinking=True,\n", + " tol=0.001, verbose=False))],\n", + " verbose=False)" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.svm import SVR\n", + "\n", + "SVR_model = Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('model', SVR())\n", + "])\n", + "\n", + "SVR_model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7275639524733043" + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "SVR_model.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.3570026426754465\n", + "0.5974969813107398\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "y_pred = SVR_model.predict(X_test)\n", + "mse = mean_squared_error(y_pred, y_test)\n", + "rmse = np.sqrt(mse)\n", + "print(mse)\n", + "print(rmse)" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 3 folds for each of 10 candidates, totalling 30 fits\n", + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492 ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492, total= 10.9s\n", + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492 ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 10.8s remaining: 0.0s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492, total= 10.7s\n", + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492 ...\n", + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492, total= 10.9s\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492 ...\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492, total= 
10.1s\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492 ...\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492, total= 10.1s\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492 ...\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492, total= 10.0s\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784 ....\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784, total= 11.1s\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784 ....\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784, total= 11.1s\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784 ....\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784, total= 11.2s\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594 ...\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594, total= 11.2s\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594 ...\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594, total= 11.6s\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594 ...\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594, total= 11.8s\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165 ..\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165, total= 9.9s\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165 ..\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165, total= 10.1s\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165 ..\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165, total= 7.3s\n", + "[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009 ....\n", + "[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009, total= 6.4s\n", + 
"[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009 ....\n", + "[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009, total= 8.2s\n", + "[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009 ....\n", + "[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009, total= 11.8s\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101 ...\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101, total= 10.3s\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101 ...\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101, total= 5.7s\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101 ...\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101, total= 9.1s\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465 ...\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465, total= 10.7s\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465 ...\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465, total= 11.0s\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465 ...\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465, total= 11.6s\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697 ....\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697, total= 11.9s\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697 ....\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697, total= 6.6s\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697 ....\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697, total= 11.9s\n", + "[CV] model__C=5.32139202170268, model__gamma=0.00373396867904038 .....\n", + "[CV] model__C=5.32139202170268, model__gamma=0.00373396867904038, total= 10.0s\n", + "[CV] 
model__C=5.32139202170268, model__gamma=0.00373396867904038 .....\n", + "[CV] model__C=5.32139202170268, model__gamma=0.00373396867904038, total= 10.0s\n", + "[CV] model__C=5.32139202170268, model__gamma=0.00373396867904038 .....\n", + "[CV] model__C=5.32139202170268, model__gamma=0.00373396867904038, total= 10.0s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 5.1min finished\n" + ] + }, + { + "data": { + "text/plain": [ + "RandomizedSearchCV(cv=3, error_score=nan,\n", + " estimator=Pipeline(memory=None,\n", + " steps=[('scaler',\n", + " StandardScaler(copy=True,\n", + " with_mean=True,\n", + " with_std=True)),\n", + " ('model',\n", + " SVR(C=1.0, cache_size=200,\n", + " coef0=0.0, degree=3,\n", + " epsilon=0.1, gamma='scale',\n", + " kernel='rbf', max_iter=-1,\n", + " shrinking=True, tol=0.001,\n", + " verbose=False))],\n", + " verbose=False),\n", + " iid='deprecated', n_iter=10, n_jobs=None,\n", + " param_distributions={'model__C': ,\n", + " 'model__gamma': },\n", + " pre_dispatch='2*n_jobs', random_state=None, refit=True,\n", + " return_train_score=False, scoring=None, verbose=2)" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import RandomizedSearchCV\n", + "from scipy.stats import reciprocal, uniform\n", + "\n", + "SVR_model = Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('model', SVR())\n", + "])\n", + "\n", + "param_dists = {\n", + " 'model__gamma': reciprocal(0.001, 0.1),\n", + " 'model__C': uniform(1,10)\n", + "}\n", + "\n", + "rnd_search_cv = RandomizedSearchCV(SVR_model, param_dists, verbose=2, cv=3)\n", + "rnd_search_cv.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('scaler',\n", + " StandardScaler(copy=True, with_mean=True, 
with_std=True)),\n", + " ('model',\n", + " SVR(C=5.908725733130206, cache_size=200, coef0=0.0, degree=3,\n", + " epsilon=0.1, gamma=0.04722302780443009, kernel='rbf',\n", + " max_iter=-1, shrinking=True, tol=0.001, verbose=False))],\n", + " verbose=False)" + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rnd_search_cv.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.37646952041010656\n", + "0.6135711209062129\n" + ] + } + ], + "source": [ + "y_pred = rnd_search_cv.predict(X_test)\n", + "mse = mean_squared_error(y_pred, y_test)\n", + "rmse = np.sqrt(mse)\n", + "print(mse)\n", + "print(rmse)\n", + "\n", + "# Note that our original model performed better thanks to its default gamma = 'scale' option\n", + "# To improve performance, further tuning of gamma would be required" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Ch5/Exercises.ipynb b/Ch5/Exercises.ipynb new file mode 100644 index 000000000..bc6125423 --- /dev/null +++ b/Ch5/Exercises.ipynb @@ -0,0 +1,657 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "import os\n", + "\n", + "from sklearn.svm import SVC\n", + "from sklearn import datasets\n", + "\n", + "%matplotlib inline" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.svm import SVC\n", + "from sklearn import datasets\n", + "\n", + "iris = datasets.load_iris()\n", + "X = iris[\"data\"][:, (2, 3)] # petal length, petal width\n", + "y = iris[\"target\"]\n", + "\n", + "setosa_or_versicolor = (y == 0) | (y == 1)\n", + "X = X[setosa_or_versicolor]\n", + "y = y[setosa_or_versicolor]" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "# Plot the data\n", + "\n", + "def plot_setosa_versicolor(X = X, y = y):\n", + " plt.plot(X[:,0][y==0], X[:,1][y==0], 'bo', label='iris setosa')\n", + " plt.plot(X[:,0][y==1], X[:,1][y==1], 'r^', label='iris verticolor')\n", + "\n", + " plt.xlabel('petal length', fontsize=15)\n", + " plt.ylabel('petal width', fontsize=15)\n", + " plt.legend()" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_setosa_versicolor()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**EXERCISE 8**\n", + "\n", + "Train LinearSVC on linearly separable data, then train SVC and SGDClassifier and compare" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_decision_boundary(pipeline, X=X, y=y):\n", + " # Get params for our decision boundary\n", + " transfX = pipeline['scaler'].fit_transform(X)\n", + "\n", + " Xmin = transfX.min()\n", + " Xmax = transfX.max()\n", + "\n", + " b = pipeline['clf'].intercept_\n", + " w0 = pipeline['clf'].coef_[0][0]\n", + " w1 = pipeline['clf'].coef_[0][1]\n", + "\n", + " # get our input values to build line\n", + " x0 = np.linspace(Xmin, Xmax, 200)\n", + "\n", + " # Setup boundary\n", + " # b + w0x + w1y = 0 ==> y = -w0/w1 * x - b/w1\n", + " boundary = -(w0/w1) * x0 - b/w1\n", + " margin = 1/w1\n", + " top_gutter = boundary + margin\n", + " bot_gutter = boundary - margin\n", + "\n", + " # Plot our boundary and gutters\n", + " plt.plot(x0, boundary, 'k-')\n", + " plt.plot(x0, top_gutter, 'r--')\n", + " plt.plot(x0, bot_gutter, 'b--')" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Preprocess the data\n", + "\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.svm import LinearSVC\n", + "\n", + "Linear_SVM_clf = Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('clf', LinearSVC()) \n", + "])\n", + "\n", + "Linear_SVM_clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", 
+ "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_setosa_versicolor(X = transfX)\n", + "plot_decision_boundary(Linear_SVM_clf)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('scaler',\n", + " StandardScaler(copy=True, with_mean=True, with_std=True)),\n", + " ('clf',\n", + " SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,\n", + " coef0=0.0, decision_function_shape='ovr', degree=3,\n", + " gamma='scale', kernel='linear', max_iter=-1,\n", + " probability=False, random_state=None, shrinking=True,\n", + " tol=0.001, verbose=False))],\n", + " verbose=False)" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.svm import SVC\n", + "\n", + "SVC_clf = Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('clf', SVC(kernel='linear')) \n", + "])\n", + "\n", + "SVC_clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAELCAYAAAA2mZrgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAgAElEQVR4nO2dd3xU1fLAvye00DtCqE9sdKQJgnQREUVEQRSkJyDYlYc/fZaHPgs+8SkqCb2oiCKIYkFAEBSEKEUF6YQSpIQuLcnO74+TRsgmm2Q3d3cz38/nfHbvPfeeO3c32blzZs6MEREURVEUBSDEaQEURVEU/0GVgqIoipKCKgVFURQlBVUKiqIoSgqqFBRFUZQUCjotQG6pUKGC1KpVy2kxFEVRAopffvnlqIhUTL8/4JVCrVq1iI6OdloMRVGUgMIYE5PRfp0+UhRFUVJQpaAoiqKkoEpBURRFSUGVgqIoipKCKgVFURQlBVUKiqIoSgqqFBRFUZQU8rdSWLUKjh1zWgpFURS/If8qhfh4uOsuqFoVHngAfvwRtLaEoij5nPyrFAoVgsWLYdAgWLAA2rSBBg3sPkVRlHyKXykFY0x1Y8z3xpgtxpg/jDGP+PSCjRvDe+9BbCxMngzFikFoqO3btQtWr1brQVGUfIVfKQUgAXhCROoALYGRxpi6Pr9qiRIwZAisXQs33WT3vfMO3HgjNGoEEybAiRM+F0NRFMVp/EopiMhBEfk16f1pYAtQNU+FMMa+jh0LUVFQpAg89BCEhdlXRVGUIMavlEJajDG1gOuBnzPoCzfGRBtjoo8cOeIbAUqUgGHDYN06+OUX6N/f+iHATinNng0nT/rm2oqiKA5hxA/nzI0xJYAVwMsi8llmxzZr1kzyPHX2xo3WH1G0KNx7L0REQIsWqVaGoiiKn2OM+UVEmqXf73eWgjGmEDAP+CArheAYjRpBdDT06wdz50LLlnD99bBtm9OSKYqi5Aq/UgrGGANMAbaIyJtOy5MpTZtan0NsLEycCGXLQvXqtm/xYuu09kMrTFEUJTP8SikArYH+QEdjzIak1s1poTKlVCk7ffT993Y6CeDpp+GGG6BJE3j/fTh1ylkZFUVxz8GD0K4d/PWX05L4BX6lFERklYgYEWkoIo2T2ldOy5Vtvv/eKgOABx+EKlXgf/9zViZFUTJm7Fib8mbsWKcl8Qv8SikEDaVKwfDh8Ouv8PPP0Lcv/OMfti82FiIj4fRpZ2VUFMVaCdOmgctlX9VaUKXgU4yxUUmTJ8Mdd9h9CxZYhVGlCoSH23BXRVGcYexYqxAAEhPVWkCVQt4zYgSsWQO9e9u1Ds2aWf9DfLzTkilK/iLZSrh40W5fvKjWAqoU8h5jrBKYOtX+UU6YYFNrJC+Me+stO+2kKIpvSWslJKPWAgWdFiBfU7o0jByZun3sGDz7LPz9t7UgwsOtP6JECedkVJRgZfXqVCshmYsX4aefnJHHT1BLwZ8oVw7277fJ+M6ft0qhShVN560ovmD9eruWKH1bv96z84M0lFWVgr9RpgyMGgWbNtnCP3ffbVdLA3zzjXVanznjrIyKogRtKKtf5j7KDo7kPnKKAQNg5kwoWdKm2IiIsCk3FEXJWw4ehCuvtBZ90aK2/krlyk5LlS0CJveRkgnTp9snk549bZRE48YwcKDTUilK/iOIQ1lVKQQSxkDr1jBjBhw4YFdJ33ab7Tt9Gh55xE47KYriO4I8lFWVQqBSrhw8/DDcc4/dXrfOrpRu1AhatbJ/pGfPOiujogQjQR7KqkohWOjY0VoP48fb0qGDB9tqcYcOOS2ZoniXnEb9eCtayJNQ1gCOTFKlEEyULw+PPgqbN8OKFXY66YorbN/LL1ufhFoPSqCT06gfb0ULeRLKGsCRSRp9lB9ITLSL4TZssCGv/fvbyKV69ZyWTFGyR06jfvIyWihAIpM0+ig/U6CATZ2xYgV062Z9D/Xrw9tvOy2
ZomSPnEb95GW0UIBHJqmlkB85etSud7jjDrjqKlv/YcECaz3Ureu0dIqSMWmfwJPx5Ek8p+flpYwOoJaCkkqFCvD441YhAGzcaIsC1atnk/PNmgXnzjkro6KkJ6dRP3kZLRQEkUmqFBTrnD5wAMaNs9FKDzxgM7kGuBWp+AHejMLJadRPbhLfJfvhPF3/EwRJ9lQpKJaKFeHJJ2HrVli2DJ57zi6Wi4+Hu+6ytR/SmsSK4gnejMJJjvoJSfrZCgnxLOon+bwRI+w5Dz7oeeK7fv3g5Em4777syZjTJHv+gIgEdGvatKkoPmTHDpGrrrJ/2uXKiTz6qMiWLU5LpQQCsbEioaH2b6doUZGDB3M/5nvvXfpzGxXl2fVyIsv69Zdea+PG3MvvRwDRksFvqloKSubUrm2th6VLoXNnePddqFMH1q51WjLF3/FFFM6oUZduDx/u2fVyIku/fpdue2otBDgafaRkj8OH4ZNPUk3xl16C48dt7Ydrr3VaOsVf8EUUzvvv26mf9ERFQffu7q8nkn1ZNmxITVmflo0boWHDnMnvZ2j0keIdKlWy1eKS53X377frHa67Dtq3h48+ggsXHBVR8QN8EYWT3kpIZvjwzK+XE1nSWwnJ5ANrQZWCkjsmToR9++CVV2DvXvtP88gjTkulOI0vonDS/7Cn3Z/Z9XIiy86d2dsfRKhSUHJP5cowZgzs2GFLhz70kN2/fj106ABz5qj1ECh4K4Q0t1E4GYWCZjRe2nFFIDYW2ra195G8P7lv/XpbF33jxktlyeiez53L+Dr5YP2OKgXFe4SEwM03p+ZUOnwYYmKgb1+oVg1Gj4bt252VUckcf0nklt1Q0GQyk9/dmP5yz35CvnU0T548mVWrVhEREUHLli0xxvhAOgWXC5YssfmWPv8cihe3C+RCQ52WTEmPvyRyS+/k9dS5m5n87sb0l3t2gIBxNBtjphpjDhtjfvfldY4cOcK8efO48cYbadSoERMmTODEiRO+vGT+JCQEunSBefOs72HOHKsQRGwNiNGj7bST4jz+ksgtp6Ggmcnvbkx/uWd/IqPFC042oC3QBPjdk+Nzs3jt1KlTEhkZKU2aNBFAihYtKoMGDZLVq1eLy+XK8biKB5w6JXLnnSIFCtjZ2k6dRD7+WOTCBacly5+kXdyV3Ly14Cw7pF8w5unCsczkdzfmkiX+cc8OgZvFa44rgQyFglp5oRTSEh0dLcOGDZPixYsLIA0bNpR3331XTpw44ZXxFTccOCAydqxIjRr2z/HDD+1+Vcp5y4gRIoULX/oDWbiwyIMP5q0c9epl/ANer17m52Umv7sxy5b1j3t2iKBSCkA4EA1E16hRw6sf1MmTJ2XixIly/fXXCyDFihWTwYMHy88//6zWgy9JSBD56iuRc+fs9uuvi3TuLPLJJyIXLzormz8TGyvStm3un24bN874h7Nx49RjFi+2lt3SpZefv369SOnSlz/Ru9vvri/9k3tyCw3N/J4zk9/dmMZkfc9BTFAphbTNV7mPXC6XrF27VoYOHZpiPTRq1Ejee+89OXnypE+uqaRh4sRU6+GKK0TGjBHZudNpqfyPESNEQkLy5um2bFlJecJOT/LTePonenf7s+rL7L5y2qdcgiqFXHDy5El57733pFGjRinWw5AhQ2Tt2rVqPfiShASRRYtE7rjD/qN37Zral5jonFz+gi8Szrlj8eJLn6bTWgvuEsdlllAus76cJrbLy88jCFCl4AVcLpf8/PPPMmTIEClWrJgAcv3118v777+v1oOv2bdP5I8/7PuYGJGwMJGnnxbZtctZuZwk7Ty6r+fCk62EtPPxyaSfs09+8ne3P6u+zO4rp33KZQSMUgA+Ag4C8cB+YEhmxzuVOvvEiRPy7rvvSsOGDQWQ4sWLy9ChQ2XdunWOyJOv2LJF5PbbrfVgjEiXLiLz5uUv30NeRgultxLSWgvuIns+/jjj/Rs3Zh5hlNl95bRPyZCAUQrZbU7XU3C
5XLJmzRoZNGiQFC1aVABp0qSJTJw4UU6dOuWobEHP3r0izz8vUrWqdYDGxtr9+UE55GW0UHorIa214C6yJ71saS2CzCKMMruvnPYpGaJKIQ84fvy4TJgwQRo0aCCAlChRQsLDwyU6Otpp0YKb+HiRn39O3e7Uyfof5s+3fcGIJ9FCOSGjyB53UTrGuI/scddCQzOPMMrsvnLal9m95WNUKeQhLpdLVq9eLQMHDkyxHpo2bSpRUVFy+vRpp8ULblyuVOsBRKpUEXn2WeuHULLG25E9OY0G8kUUkUYmXYIqBYc4fvy4vPPOO1K/fv0U6yEiIkJ+/fVXp0ULbuLjRT7/XOS22+wT7Ztv2v0XLgSv9ZBbvB3Zk9NoIF9EEWlk0mWoUnAYl8slP/74owwYMEBCQ0MFkGbNmsmkSZPUevA1MTEiySvTp061kUv/+pdaD+nxdmRPTqOBfBFFpJFJl6FKwY84duyY/O9//5O6desKICVLlpThw4fL+vXrnRYt+FmxQuTWW631YIxIt27Wosjv6028HdmT02ggX0QRaWRShqhS8ENcLpesWrVK+vfvL0WKFBFAWrRoIZMnT5YzZ844LV5ws3u3yDPPiFSuLHL99alKIb+uN/F2ZE9Oo4F8EUWkkUkZ4k4p+F3q7PyEMYbWrVszc+ZMYmNjeeuttzh9+jRDhw4lLCyMkSNHsnHjRqfFDE5q1YKXXrIlROfPB2NsAZZq1eD22+HLL20q5fyCt8tZ5rQcpy/KePpizGAmI00RSC2QLYWMcLlcsnLlSunXr1+K9XDDDTfI1KlT1XrwNXFxqdZDcuRSzZoiv/3mtGTeJbPQTA3bzDeglkJgYIyhTZs2zJo1i9jYWMaPH8/JkycZPHgwYWFhjBo1it9++81pMYOTcuVSrYd586BgQVtO9OWXbf+xY8FhPWRWflJLU+Z78m05zkBCRFi5ciVRUVF8+umnXLhwgZYtWxIREUHv3r0pVqyY0yIGHxmVaXzsMfjxRxg6FIYMgapVnZYy+2RWfjIfl6bMjwRMOU7lcowxtG3bltmzZ3PgwAH++9//cvz4cQYNGkRYWBgPPfQQv//u0+ql+Y+MyjTeey9cdx08/zzUqAE9esCyZc7KmV0yKz+ppSkVUJ9CoOJyuWT58uXSt29fKVy4sABy4403yvTp0+Xs2bNOixfYZBXCuGOHre9QqZJ9FbFpvg8ccE5mT9CEckoa8IZPwRhzjTGmozGmW/rmG5WluMMYQ7t27fjwww85cOAAb7zxBkePHmXgwIGEhYXxyCOP8McffzgtZmCS9ok5mbRPzrVrwyuvwL598PTTdt+331rroWdP+OYb//Q9ZHZfWd2zkn/ISFOkb0Bd4DcgEXBl0BI9GccXLb9aChnhcrnk+++/l3vvvVcKFSokgLRu3VpmzJih1kN2yEmyuZgYkX/+U6RiRXtszZoiL70kkvZzz6w0ZWbk5LzslqzMbUI5jVoKOMjN4jVgJbAV6AFcDdRM3zwZxxdNlULGHD58WMaNGydXX321AFK2bFl55JFH5I/kQjWKb7hwwdYS6NTJKoaEBLt/+3aRunXtv1xWRejTk1nZSnfkdUI5TTYXcORWKZwBuntybF43VQqZ43K5ZNmyZdKnT58U66FNmzYya9YstR58TXI9jQsXRMqVu/QJ3NOn/szKVrojrxPKabK5gMSdUvDUp7ATCM3lTJXiAMYYOnTowJw5c9i/fz+vv/46f/31F/3796dq1ao89thjbNmyxWkxg5OSJe2rMZA+bLhNG1i/Pusx+vW7dPu++7I+xxdRRBq1lH/ISFOkb0Bn4FfgSk+Oz8umlkL2SUxMlCVLlkjv3r1TrIe2bdvK7Nmz5dy5c06LF3y4Kz85aZLtj40V+esvz8/LzFrI64RyGrUUsJDd6SNgHbA2TTsEXAS2pdu/FljrbhxfN1UKuePQoUPy6quvypVXXimAlCtXTh577DHZsmWL06IFD+7KT9ata/sfflikYEG
Ru+8W+e47kcTEzM/LzLeQ1wnlNNlcwOJOKbhd0WyMmQ54vNxZRAZl307JPflhRXNe4HK5WLZsGZGRkSxYsICEhATatWtHeHg4vXr1okiRIk6LGLgULWpXCacnNBTOnYNt2yAqCqZPh7g4G/L68MPwz39mfl5GXH89bNhw+f7GjT2brsrumOD96yl5grsVzZrmQrmMQ4cOMW3aNCZNmsSuXbsoX748AwcOZNiwYVx77bVOixe8nD8Pn31mFUTt2jBlit3/00/QsiWEaAICxXvkSikYY6YCY0VkdwZ9NYHnRWSwVyTNJqoUfIfL5WLp0qVERkby+eefk5CQQPv27YmIiKBnz55qPfiShASbkC86Gpo3t0oiPBwGDoRKlZyWTgkCcpv7aCBQ0U1fBWBADuVS/JiQkBBuvvlmPv30U/bt28d//vMfYmJi6Nu3L9WqVeOpp55i+/btTosZnBQsaF/r14fZsyEszE4nVasGffpAbKyz8ilBS3bsUXcmRX3giBdkUfyYypUr8/TTT7Njxw6+/fZb2rZty/jx47nmmmvo2LEjH3/8MRfTFzJRck9oKNx/P/zwA/zxB4wcCevWQZkytv+XX+CI/vsp3iMzR/MjwCNJmzWBv4AL6Q4LBa4ApovIEF8JmRk6feQcBw8eZOrUqUyaNImYmBgqVqzIwIEDCQ8P56qrrnJavODF5bL+BRGoVw927oS77rLTS+3b23URipIF2fYpGGNuBroABngc+BA4mO6wi8CfwFwRSa8w8gRVCs6TmJjId999R1RUFAsXLiQxMZGOHTsSERHBnXfeSeHChZ0WMXjZvNk6pmfMgBMn4Jpr4D//gV69nJZM8XNy62h+HpgsIgd8IVxuUKXgX8TGxqZYD3v37qVSpUoMGjSIYcOGUbt2bafFC17OnYNPP4XISHjoIet3OHzYKo127dR6UC4jYEJSjTFdgf8BBbCK6NXMjlel4J8kJiayePFiIiMj+fLLL0lMTKRz586Eh4fTo0cPtR58iYhVAq++alN7X3utnVp64AGoUMFp6RQ/ISfTR9kqKSUiHXMoW9prFsCumL4Z2I9dVd1XRDa7Oyc3SmH0aKhVy/rxSpfO0RCKBxw4cCDFeti3bx+VKlVi8ODBDBs2jCuvvNJp8YKXs2fhk0+s9bB6NRQubC2I6dN1zYOSo5DUuHTtGuAmoBg2a2oxoA02lfZRL8nZAtghIrtE5CIwB5uu2+vEx8P339tgjrAwW3Z33Tr7kKV4l6pVq/Kvf/2L3bt3s2jRIlq2bMnrr79O7dq16dKlC/PmzSM+Pt5pMYOPYsVgwAC7+G3TJmstJDupAebNg2PHnJVR8T8yyn2RvgFDgE1AjXT7ayTtH+bJOB5c527slFHydn9gQgbHhQPRQHSNGjVynPvD5RJZu1ZkyBCRYsVs2pZ3383xcEo22Ldvn7z44otSvXp1AeSKK66Qp59+Wnbt2uW0aPmDAwdEjBEpUkSkXz+RH36w/xBKvoFc1lPYBdzppq8nsNuTcTy4zj0ZKIV3MjvHWwnxTpwQee89kf377fbChSLDhomsW+eV4RU3JCQkyBdffCG33367hISEiDFGunTpIvPmzZOLFy86LV5ws3GjyMiRIqVKpSbo0z/4fIM7peDpxGJlwF1OgyKAt9bd7weqp9muBuTJ0s3SpWHECKha1W7v2gUffGAzDDRtaqP+Tp/OC0nyFwUKFKB79+4sXLiQPXv28Nxzz/HHH3/Qq1cvatSowTPPPMOePXucFjM4adgQJkywq6OnToWyZaF60r/fTz/BqlU6n5ofyUhTpG/AV8AeoFm6/c2BGOArT8bx4DoFsVbJP4DCwEagXmbn+DJ19okTdjqpYUP7INWkic8upaQhPj5eFi5cKLfddluK9dC1a1f57LPP1HrIK269NdV6eOstkWPHnJZI8TLkcvqoGrbITiL2yX1D0msisB6o5sk4Hl6rGzYCaSfwTFbH50U9BZdLZPVqka+/tttnz4p
07CgSGZlacVHxDTExMfLcc89JWFiYAFKlShV59tlnZc+ePU6LFtycOSMyebJI8+b2ZyI0VOT5552WSvEi7pRCttYpGGO6JVkHlbFpL9aJyFc5MlG8hBPrFLZuhbvvht9/hxIlbEhrRIRNO6/4hoSEBL766isiIyP5+uuvAejatSvh4eF0796dgskJ5BTvs369nT9t2tSG6Z05Y8Na+/VLzcGkBBwBs3gtuzi1eE0E1qyxIeAff2xT4W/YAI0a5bko+Y6YmBimTJnClClTiI2NJSwsjCFDhjB06FBq1KjhtHjBz7x59qmoaFG77iE83NZ70FXTAUVOFq8VE5Gzye+zukDysXmNP6xoPn4cFi60C0aNgSefhL//ttZDcnEqxfskJCTw5ZdfEhUVxTfffAPArbfeSkREBN26dVPrwZf8+qu1Hj74wFoODRrYTK5qOQQMOVEKiUArEVlrjHGRRWlOESngFUmziT8ohfSMGmWLZp0/Dy1aWOXQpw8UL+60ZMHLnj17UqyHgwcPUrVq1RTroXr16lkPoOSM06dhzhy7YnrqVLtvyhSbvfWGG9R68GNyohQGAF+KSJwxZiBZK4UZ3hA0u/ijUgBrPcyaZaeXNm+GRx6Bt97y/PwPPoBnnoG9e6FGDXj5Zeu7UDInPj6eL7/8ksjISBYvXowxhm7duhEeHs6tt96q1oOvuXjRFgI6csSGvIaHW9+D5pHxO9wpBa9EDDnZ8iL6KDe4XCKrVons3Gm3f/hB5IYbRKZMsQEeGTF7duoK6+RWrJjdr3jOrl275P/+7/+kcuXKAki1atXk+eefl3379jktWnBz6pTIxIki11+f+sc7Z47TUinpIJchqWOxSepKeHJ8XjZ/Vwrp+eYbkeuus598qVJ2QenGjZceU7PmpQohudWs6YTEgc/Fixdl3rx50qVLFwEkJCREbr/9dvnyyy8lISHBafGCm+homxpgyxa7vXq1TR1w8qSzcim5C0k1xvwG1AVc2AVlK5PaKhE57B1jJmf46/RRZojYxaKRkTYFfqlScOAAFCpk+woUyHghqTE2n5mSc3bt2sXkyZOZOnUqhw4donr16gwdOpTBgwdTrVo1p8ULfp56Ct54wybr69vXOtyaNVPfgwPkOiTVGFMWmyU1uTXB1jzYDqwUkWHeE9dzAlEppCUuzpbebdvW/uA3awY7dmScUqNmTdCMD94hPj6ehQsXEhkZyXfffUdISAjdu3cnIiKCW265hQIFHImbCH5EIDraPhF99JFN792tGyxa5LRk+Q6vrlMwxhQGOgOjgbaAiEYf5ZoTJ2zk0ty5NrV3WooVsxGA6mz2Pjt37kyxHg4fPkyNGjUYOnQoQ4YMISwszGnxgpdTp2xERUiItRgSEuCf/7QWRLPL/Z+Kd8mVoxkoBdwK/Ac7bXQOOAIsAJ4AWngyji9aoPkUPOHoUZH77hMpWND6EipWtE7m5Onv2bOtf8EY+6oOaO9w4cIFmTt3rnTu3FkAKVCggPTo0UO++uor9T3kBRs2pEZYNGlindWaR8ZnkEtHcwLwN/AhEEEWSerysgWjUkjG5RJZsUIkPt5uP/OMyDXXiBQuLBqZ5GO2b98uo0ePlooVKwogNWvWlLFjx8qBAwecFi24SZ+FsnhxkT/+cFqqoMSdUvDU0bwa60M4BfyQZC2sADaIJwP4kGCaPsqKyZNteu+EhMv71N/gGy5evMiCBQuIiopi6dKlFChQgNtvv52IiAi6dOlCiJa19A0isHatnUsdN85OMb33no3G6NvXJh1TcoU3HM1FgZZYH8JNSe8TgJ+AFSLymvfE9Zz8pBTAfZCGMXYFdeHCeStPfmL79u1MmjSJadOmcfToUWrVqsWwYcMYNGgQVapUcVq84KdTJ1i2TLNQegmvLl7D+hh6AMuxYaqJORnHGy2Yp48ywt0ahrAwkXLlRB59VGTzZqelDG7Onz8vc+bMkQ4dOgggBQsWlLvuuku+/fZbSUxMdFq84MX
lEvnpJ5EBA2wqb7BzqkqOIDeV14wxlY0x9xhj3jbGrAfigHlAWeBdoK8XFJfiAS+/bCOR0lKsGDz6KHTuDO++C3XrQrt2NrDjwgVn5AxmihQpQp8+fVi2bBlbt27l0Ucf5YcffuCWW27hqquu4pVXXuGvv/5yWszgwxho1cqm7Y6Nhbffhh49bN/vv9u51fXrHRUxKMhIU6RvWGvgPHaq6DXgNqCMJ+f6uvmLpZCTiKBOnS592u/UybPxRowQKVDAnlOggN1O5tAhkddeE6ld2z5MJRfM+vtvjVryJefPn5cPP/xQ2rdvn2I99OrVSxYvXqzWQ14wc2aq9dC8uS0Q5C6PjCIi7i0FT5VCByDUk2PzuvmDUshJrqL0CiGtYshsPE+vlZh4adDGlVeKhIRkT0YlZ/z555/yxBNPSPny5QWQK6+8Ul555RX566+/nBYtuImLE/nf/2wJ0eQ51eTQPeUy3CkFLbLjBWrVgpiYy/dnFhGU2ar+mjXdjwfZv1ZiIlSoYBfHZec8JXecP3+ezz77jKioKFasWEHBggW58847iYiIoGPHjhq55CtE4KefYNs2GDTIbg8caOdUNYd9Clp5zYeEhGQ/V1FmSsEY9+NBzvIiZSbj33/b/iJF3J+v5I4///yTqKgoZsyYwbFjx6hdu3ZK5FKlSpWcFi+4OXrU5pHZssUmGuvf30YuNWjgtGSO4k4p6KOKF3BXATKnlSEzGy+n18rsvEmTbAr8p56C7ds9l1PxnOuuu44333yTAwcOMHv2bMLCwhgzZgzVqlWjT58+LF26FJdmO/QNFSrYBGM//AB33GEX/DRsCF984bRk/klGc0qB1NSn4JlvILPzfvxRpFev1LQaHTuKfPyxjQBUfMfmzZvl0UcflbJlywogV111lbz++uty6NAhp0ULbo4eFXnrLZGzZ+32+++LjBolsmmTs3LlMeTG0ezPzR+UgkjeRh/lNIooq/NiY0VeflmkVi2Rtm1T9+tvlG85e/aszJo1S9q0aSOAFCpUSPr06SPLli0Tl2pm3zNmTGrumFatRKZPT1UYQYwqBR/jLkw0pz/8meHr0NLERJHkQJm//rL/L8nWw4UL3r2Wcim///67PIrEiTkAACAASURBVPLII1KmTBkB5Oqrr5Zx48bJkSNHnBYtuDlyROS//xW59lr7j9qjh9MS+ZxsKwVgHbDW0+ZuHF83f1AKI0Zc+sOfdpVxTqaIMiOvS3UeOyby0kupK6krVhQZPVrk4EHfXE+xnD17VmbMmCGtW7cWQAoXLiz33nuvfP/992o9+BKXS2T5clshTkRk/36Rm24SmTEj6KwHd0rBbfSRMWY64HFokogMyr5HI/f4Q/RRwYI27DM7ZBZ2mlmIaE7CX71BYiIsXmxro3z1lY32q1ULDh6E8uU155Iv+f3334mKimLmzJmcPHmSa665hvDwcAYMGECFChWcFi+4Wb3ahrNu2wZlysADD0B4ONSr57RkuUZDUn1ITioJZhZ2mtPQ0rwKXjl2DMqVs++7d7eFtAYNgmHD4Mor80aG/MjZs2f55JNPiIyMZPXq1RQuXJi7776b8PBw2rZti9GSlr5BBFassE9E8+bZf7TYWAjwUGK/D0lNyq30hzHGZYwJqLJLOanc6IvQ0rwiWSGArRTXsiW8/jrUrg233GItCsX7FCtWjAEDBvDTTz+xadMmwsPDWbRoEe3bt6dOnTq8+eabxMXFOS1m8GEMtG9vy4ceOAAff5yqEO69Fx57zK6BCBI8VgrGmFrGmGeNMTONMXPTNy/I8jtwF7ZeQ0ARHp7xfneVHDt1cp/Y7uWXM79WTs/zFV27woIFsHcvvPii/d9INtwuXIBdu5yRK9hp0KAB77zzDrGxsUybNo2yZcvyxBNPULVqVfr168fKlSsJ9FkAv6RiRejVy75PSLAKIzkLZdu2Ngvl+fPOyphbMnI0pG9AU+AMsAVIBNYDu7CJ8vYCyzwZx8NrLQeaeXp8Thz
NOY3eySwRXZkylzp/y5Sx+zNyNCdTqNCl+wsVSu1L76QOC/NMDl/cd3ZISEj1x82ZY6/VpYvIvHkiFy96/3pKKhs3bpSRI0dKqVKlBJA6derI+PHjJS4uzmnRgpu0WShB5M037X4/DwgglwnxlgEzgAJJiqBJ0v4bgRigqyfjeHgtnyqFnEbvuIswGjEiNf+Wp61QIZGiRTPuK1rUfdRSWFjgRC2J2MCNF14QqVbNXq9yZZH/+z+Rc+d8d01F5MyZMzJlyhS54YYbBJAiRYpIv379ZOXKlRq55EsSE0WWLLGL40Rs5tZ27UQ++MAv/+hzqxSOAbcAJkkp3JimbzC2LKcn4yzBThOlbz3SHJOlUgDCgWggukaNGtn6INwVqalZM/Pzkp/M0zd3+33Vcip/Ts/zBvHxIgsXitx2m0i9eqkPUBs3ahJLX7NhwwYZMWKElCxZUgCpW7euvPXWW3IsOae64js+/NCmJwaR8uVFHn9c5M8/nZYqhdwqhTigQ9L7v4C+afpuBv72ZBwPr+VTS8GYjH8cjcnqA/SPllP5c3qetzl/3r6eOSNSqpRIlSoizz4rsmdP3sqR3zhz5oxMnjxZWrRoIYCEhoZK//79ZdWqVWo9+JLERJHvvhO5+26bR6ZRo0v7HMSdUvDU0bwZqJ30fjXwmDHmamNMTWA0sNPDcRwnp9E77iKMchJ5lBsCOWoJUjOxFikCs2ZBkybWSf6Pf0C3bvDrr3krT36hePHiDBkyhJ9//pn169czcOBAFixYQJs2bWjQoAFvv/02x48fd1rM4CMkxJZE/OQT2LcPpkyx+0+dsuF6Tz1l10D4ExlpivQN6A88m/S+DrAP63BOBE4BXTwZJ4tr9AT2AxeAQ8C3npynPgX/9Sl4SkyMyL/+Ze9v3Tq7b/dutR58zenTp2XSpEnSrFmzFOthwIAB8uOPP6r14GtiYi7NQtmhg8hHH6Wa0nkA3sx9BJTAThvdAVTKyRjeat6OPsppGcz0iqFu3eQP/vKWTPpKaCEhqX2ZRR/5a86k3JLWvzBokJWzWzeRzz9X34Ov+eWXXyQiIkJKlCghgNSvX1/eeecdOX78uNOiBTdps1CCdbSJ5MkffK6UAvAAUN5NXzngAU/G8UXzZu4jbz9NZ2ZdZNanWEvh2WetzwFEqlYVeeMNp6UKfk6dOiVRUVHStGlTAaRo0aIycOBAWb16tVoPviQxUWTVqtTtfv1skjQfZqHMrVJIBFq46WsKJHoyji+aN5WCtyN0MotYyqxPSSU+XmT+fJFbbxV59FG7z+US+eYbuyZC8R3R0dESHh6eYj00aNBAJkyYICdOnHBatOBn3LjLs1Bu3+7VS+RWKbgyUQpdgOOejOOL5k2l4O0InZxGGCkZk/yg+v339nOqVs2ug9i3z1Gxgp5Tp07JxIkT5frrr0+xHgYNGiRr1qxR68GXJCSIfP21yJ132qfFp56y+xMTvWI9uFMKmWVJ7QH0SNocCCwCjqQ7LBS4CdgiIl08c217F28mxPN2BlJ32VOTI5bc9SUkZP9a+Yn4eFtJMSrK5lkyBm67zW5Xruy0dMFNdHQ0kZGRfPTRR/z99980atSI8PBw7r//fkqXLu20eMFLbKz9cbjiCvj2W5ut9Zln4OGHczxkThLiVQIaJDWwIakN0rWawGIgIseS+RHezivkLidSeHjmfUrmFCoEd90F33wDO3fCmDE291Jyor4ffoD9+52VMVhp1qwZkyZNIjY2lvfffx9jDCNHjiQsLIwhQ4awdu1a3D1oKrkgLMwqBLC56lu18l1a5IzMh/QN+B6o48mxed28XWTH2xE6mUUs5TSHkXI5ybMYLpf93kJCRO64Q2TRIvU9+BKXyyVr166VIUOGSLFixQSQxo0by3vvvScnT550WjwlE8ju9JE7jE3aXgU4LCKOT3T4Qz0Fxb/YuRMmT4apU+HwYbtA74034J57nJYsuDl58iQffvghkZGRbNy4keLFi9O3b1/
Cw8Np1qyZ1nvwM3JdT8EY080Y8zNwHrt4rWHS/knGmH5ek1RRcknt2vDKK3YB6SefwLXXQokStm/vXvj66+xXylOypnTp0owYMYL169fz888/06dPHz788ENatGhB06ZNiYyM5PTp006LqWSBR0rBGPMAsBD4E5uMLq3K3wYM8b5oipI7CheGu++2zuhbb7X7pkyx6TRq17a+ooMHnZUxGDHG0KJFC6ZMmUJsbCwTJkwgISGB4cOHU6VKFcLDw/nll1+cFlNxg6eWwjPAOBEZAMxO1/cHUNerUimKj3jmGZg7F666Cp59FqpXh759bTCw4n1Kly7NyJEj2bhxI6tXr6Z3797Mnj2bZs2a0bRpU6KiotR68DM8VQo1ge/c9J0HSnlHHEXxLYULW9/CkiU2D9kTT0CFCql1tqdNU+vBFxhjaNmyJVOnTiU2NpZ33nmHixcvEhERQVhYGBEREfyq2RD9Ak+Vwj7gejd9zYAd3hFHUfKOq6+G116Dd96x27t3w+DB1jHdq5cNB/dV1F9+pkyZMowaNYpNmzbx008/0atXL2bOnEnTpk1p3rw5kyZN4syZM06LmW/xVClMAZ5PcigXTdpnjDGdsKmzJ/lCOEXJS/7xD9i6FR591K516NrVTjPpA6xvMMbQqlUrpk+fTmxsLG+//Tbnzp0jPDycsLCwFKe1krd4FJKaFIY6ARiOzYNUEIjHlueMFJGRvhQyMzQkVfEFFy7A/PkwYwbMmQOlS8N3SROonTrZNPmK9xERVq9eTWRkJHPnzuX8+fM0b96ciIgI+vTpQ4nkMDIl17gLSc3WOgVjzFVAJ6A8tkTnMhFxtEKEKgUlr+jcGZYutZFLw4bBwIGpi0wV73Ps2DFmz55NZGQkmzdvpmTJkvTr14+IiAgaNWrktHgBj1eUgj+iSkHJK86ft9ZDZCSsWGHTbTz9NLz4otOSBTciwo8//khUVBRz587lwoULtGjRIsV6KF68uNMiBiTeWLxW2BgTboyZbIxZlPQ6zBhT2LuiKop/Ehpqw1eXL4ctW+Chh6BePdsXF2ed1ocPOypiUGKMoU2bNsycOZPY2FjGjx/P6dOnGTJkCGFhYYwcOZJNmzY5LWbQ4KlPoQ7wDRAG/AIcxibMawL8BXQVkc0+lNMtaiko/sBHH8F991nroWdPm9iwQwf1PfgKEWHVqlVERkby6aefcuHCBVq2bEl4eDh9+vShWPrMlspl5Gr6yBizEigNdBeRvWn218Cm1D4uIm29KK/HqFJQ/IUtW2z67hkz4PhxuOYa+OWX1BQbim+Ii4tj5syZREZGsnXrVkqXLk3//v0JDw+nQYMGWQ+QT8mtUjgH9BWRBRn09QQ+FJGil5/pe1QpKP7G+fMwb54NZf3vf+2+8eOhcWNo3z51oZziXUSElStXplgPFy9epFWrVoSHh9O7d2+1HtKRW5/CHmxBnYwIBfa66VOUfEdoKNx/f6pCOHvWJujr2NEm53vjDTiSvlyVkmuMMbRt25YPPviAAwcO8N///pe4uDgGDRpE1apVefjhh/n999+dFtPv8VQpjAFeMsbckHanMaYl8G/gn94WTFGChWLFbEW/mTOhUiV46imoVs1mcFV8Q4UKFXj88cf5888/Wb58ObfeeiuRkZE0aNCA1q1bM3PmTM6dO+e0mH6Jp9NH67D5j8pjnczJjuZKQBzWkkhBRFp4W1B36PSREmj88Yf1PTzxhE2psXQpbNgAAwbYPEyKbzh69CgzZswgKiqKbdu2UaZMGR544AEiIiKoWzf/5fTMrU9hWnYuJiKDsnN8blCloAQ6Tz5pp5qSU32Hh0Pbtup78BUiwvLly4mKimLevHnEx8fTpk0bwsPDufvuuyla1BH3aJ6ji9cUxY/57TeYNMlOMZ08aWs+LFrktFTBz5EjR5g+fTpRUVHs2LGDsmXLplgPderUcVo8n6JKQVECgLNnra+hcGG7UO78eXj4YejfH9q0UevBV7hcLpYvX05
kZCTz588nPj6em266KcV6CA11F2cTuKhSUJQAJDra5lw6eRLq1LFTSw88AOXKOS1Z8HL48OEU62Hnzp2UK1cuxXq47rrrnBbPa+Q6zYWvMcaMM8b8aYzZZIyZb4wp47RMiuI0zZrBgQMwdSqUKgWPPQZhYbBDK5j4jEqVKjF69Gi2bdvGkiVL6NSpExMmTKBOnTq0a9eODz/8kPPnzzstps/wG0vBGNMFm3U1wRjzGoCIZBnqqpaCkp/YuBEWLIDnnrNTSa++CkWLWuuhbFmnpQteDh06lGI97Nq1i/LlyzNgwADCw8O59tprnRYvRwTU9FHSKum7ReT+rI5VpaDkV0SgSxdbWjQ01JYZjYiAG29U34OvcLlcLFu2jMjISBYsWEBCQgLt2rUjIiKCu+66iyJFijgtosf4/fRROgYDX7vrTMrWGm2MiT6iS0OVfIoxtvDP+vUwaJC1INq00VTeviQkJITOnTvzySefsG/fPv7zn/+wd+9e7rvvPqpWrcqTTz7Jtm2OlpjJNXlqKRhjlgCVM+h6RkQ+TzrmGWzd57vEA+HUUlAUy5kz8PHH0KoV1K0La9bAxInWOd2qlVoPvsLlcrFkyRKioqL4/PPPSUhIoH379kRERNCzZ0+/tR4CYvrIGDMAW/Kzk4ic9eQcVQqKkjEzZ8LIkVZZ1K9vlUP//lBGQzh8xsGDB5k2bRqTJk1iz549VKhQgUGDBjFs2DCuvvpqp8W7BL9XCsaYrsCbQDsR8XhOSJWCorjnzBlb6yEqyoa3Vqtm8zBpnQff4nK5+O6774iMjGThwoUkJibSsWNHwsPD6dmzJ4ULO1+bLBCUwg6gCDaXEsAaERme1XmqFBTFM379FXbtsqk0XC5bDKhLF+jXD0qXdlq64OXgwYNMnTqVSZMmERMTQ8WKFVOsh6uuusoxufxeKeQUVQqKkn0OHoTu3a2iKFoU7r3XRi61aKG+B1+RmJiYYj188cUXJCYm0qlTJyIiIujRo0eeWw+BFn2kKIoPqVLFVoWLjraWwty50LIlLF7stGTBS4ECBejatSvz588nJiaGsWPHsn37dnr37k316tUZM2YMO3fudFpMtRQURYHTp61ieOABW2f69ddh2zbrnG7eXK0HX5GYmMi3335LVFQUX3zxBS6Xi86dO6dYD4UKFfLZtdVSUBTFLSVLwpAhViEAnDoFc+bADTdAkybw/vt2n+JdChQoQLdu3ViwYAF79+7lxRdfZOvWrdxzzz1Ur16dp59+ml27duWpTKoUFEW5jJdegthYeO89u3L6wQet1aD4jqpVq/Lcc8+xe/duvvzyS1q0aMHrr79O7dq1ueWWW1JqP/ganT5SFCVTRGDdOptKo2FD2L7dOqbDw+G++6yVofiG/fv3M2XKFCZPnsz+/fupXLkygwcPZujQofzjH//I1dg6faQoSo4wxkYlNWxot48cgcREGD7cOqzDw63TWvE+1apV4/nnn2f37t188cUXNGvWjFdffZXatWvTtWtX1q9f7/VrqlJQFCVb3Hijzbe0Zg307g2zZ0Pr1nDihO0P8MkHv6RgwYJ0796dL774gj179vDcc8/x+++/E+KDVYg6faQoSq44eRJ+/tkuhAP7euWV1oJo0sRZ2YKZxMREChQokOPzdfpIURSfULp0qkK4eBGqVrV5l5o2teGskybZdBuKd8mNQsgMVQqKoniNwoVh2jQbufT223DunLUYPvjA9rtczsqnZI0qBUVRvE6ZMvDQQ/Dbb/DjjzZKCex6hxYtYMoUtR78FVUKiqL4DGOsYzo5bLViRTh7FoYOtbWmH3zQlhhV/AdVCoqi5Bm9e1vrYdUqm6V12jRb8yGZhATnZFMsqhQURclTjLEhrDNmwIED1hENdv1DWBiMGgWbNjkrY35GlYKiKI5RrhzUqWPfnzsHt9wCkydDo0a2hOi0aXa6Sck7VCkoiuIX1KgBs2ZZ62H8eLsYbvBg+Osv23/+vLPy5ReCcvFafHw
8+/fv57z+FfkNoaGhVKtWzaepgJXgQsROIzVqZLd79LBTTOHh1jdRrJiz8gU67havFXRCGF+zf/9+SpYsSa1atTCaCN5xRIS4uDj279+f6yReSv7BmFSFANCpk83aOmgQPPYY9O9v8y/VreucjMFIUE4fnT9/nvLly6tC8BOMMZQvX14tNyVXPPwwbNkCK1ZAt24QGQnz5tm++Hjrk1ByT1AqBUAVgp+h34fiDYyBtm3tCukDB2ykEsDnn9vIpUcegT/+cFbGQCdolYKiKMFNhQpQtqx9f+WV0LWrXTFdvz60aWOd1omJzsoYiKhSwD511KoFISH2NTlPS2648cYbc9SXG5YvX85PP/3kk7EVxZ9p0gQ++shaD+PGweHDMHas/Z8GOHrUWfkCiXyvFD74wEYzxMTYaIeYmEsTeOWUjH6cE5MeW3z1w61KQcnvVKwITz4JW7fC99/b6aZz5+Caa+y00+zZGtqaFfleKTzzzOWLY86etftzQ4kSJQD7Q92hQwfuu+8+GjRocEnfwYMHadu2LY0bN6Z+/fqsXLnysnHGjBlD3bp1adiwIU8++SQAR44coVevXjRv3pzmzZvz448/smfPHiZOnMj48eNp3LgxK1euJCYmhk6dOtGwYUM6derE3r17Afjkk0+oX78+jRo1om3btgDs2bOHm266iSZNmtCkSRNVLkpAY4xN4Q12Cunpp+HgQRuxVLUqPP64fQBUMkBEAro1bdpU0rN58+bL9rnDGBFrI1zajPF4iAwpXry4iIh8//33UqxYMdm1a9dlfW+88Ya89NJLIiKSkJAgp06dumSMuLg4ueaaa8TlcomIyPHjx0VEpG/fvrJy5UoREYmJiZHrrrtORESef/55GTduXMr53bt3l+nTp4uIyJQpU6RHjx4iIlK/fn3Zv3//JWP+/fffcu7cORER2bZtm2T0ueaW7HwviuJtEhNFli4V6d1bpFAhkTVr7P4jR0SS/vTzFUC0ZPCbGpTrFLJDjRoZPzHUqOG9a7Ro0SLD+PzmzZszePBg4uPjufPOO2ncuPEl/aVKlSI0NJShQ4dy22230b17dwCWLFnC5s2bU447deoUp0+fvmz81atX89lnnwHQv39/Ro8eDUDr1q0ZOHAgvXv35q677gLsgr9Ro0axYcMGChQowLZt27xz84riJ4SEQMeOth09CuXL2/1PPw2ffQYDBtip4+uuc1ZOp8n300cvv3z5yshixex+b1G8ePEM97dt25YffviBqlWr0r9/f2bOnHlJf8GCBVm7di29evViwYIFdO3aFQCXy8Xq1avZsGEDGzZs4MCBA5RMzk2cCclhoRMnTuSll15i3759NG7cmLi4OMaPH88VV1zBxo0biY6O5uLFi7m8a0XxXypUsFNMAPffbxXFO+/YPEzt28P8+Y6K5yh+oxSMMWONMZuMMRuMMYuNMWF5cd3774eoKKhZ0/6R1Kxpt++/3/fXjomJoVKlSgwbNowhQ4bw66+/XtJ/5swZTp48Sbdu3XjrrbfYsGEDAF26dGHChAkpxyXvL1my5CUWw4033sicOXMA+OCDD2jTpg0AO3fu5IYbbuDf//43FSpUYN++fZw8eZIqVaoQEhLCrFmzUpziihLstG8Pn3wC+/bBK6/A3r2wdKntE4GdOx0VL+/JaE7JiQaUSvP+YWCiJ+fl1qfgK9L6FG677bYM+6ZPny716tWTxo0bS5s2bS7xO4iIxMbGSvPmzaVBgwZSv379FP/AkSNHpHfv3tKgQQOpU6eOREREiIjI1q1bpUGDBtKoUSP54YcfZPfu3dKhQwdp0KCBdOzYUWJiYkREpGfPnlK/fn2pV6+ePPzww+JyuWTbtm3SoEEDueGGG2TMmDEpMnoTf/heFCUrEhNFTp+271essD7GDh1EPvpI5Px5Z2XzJrjxKfhlQjxjzNNADREZkdWxGSXE27JlC3WS8/EqfoN+L0qgceSITeU9aRLs3m2nnQYOhGefhdKlnZY
ud7hLiOc300cAxpiXjTH7gPuB5zI5LtwYE22MiT5y5EjeCagoSr6iYkXriN6xA779NjXFRmio7f/zTwg291ueKgVjzBJjzO8ZtB4AIvKMiFQHPgBGuRtHRKJEpJmINKtYsWJeia8oSj4lJAS6dLEJ+HbuhCJFwOWyqTWqVYPRo63iCAbyVCmISGcRqZ9B+zzdoR8CvfJSNkVRFE8oWjT1/cSJtrTom2/C1VdD586QwRrUgMJvpo+MMVen2bwD+NMpWRRFUbIiJMRaCvPn24ilsWNh+3ZbMQ7g0KHAjFzyG6UAvJo0lbQJ6AI84rRAiqIonhAWZp3Pu3bZWg9g1z1cdRXcfDN8+qmt+RAI+M2KZhHR6SJFUQKaAgVS348YYX0PkyfDPffAFVdARAS8+KJz8nmCP1kKznLwILRrl1olPJc4kTo7I6ZPn05sbGzK9tChQy9JkZGdcUaNcuv7VxQlHVWrwr/+Za2HRYvghhvs9FIy33/vn9aDKoVkxo6FVavsqxdwInV2RtdLrxQmT55M3TwoapuQkODzayhKIFCggJ1S+vxzm7obbChrx45QvTr83//ZNRD+gioFsFbCtGk2xmzaNK9YC95Inf3111/Tu3fvlO3ly5dz++23A7B48WJatWpFkyZNuOeeezhz5gwAtWrV4t///jdt2rTho48+Ijo6mvvvv5/GjRtz7tw52rdvT/Jiv2+++YYmTZrQqFEjOnXqBMCxY8e48847adiwIS1btmTTpk2X3Zu7lNwDBw7k8ccfp0OHDvzzn//M9WeoKMFGctGfq6+GL76AFi3gtdegdm245RY/CWvNaJlzIDWvpLkYMUKkcGG7nr1wYZEHH8ze+RngjdTZ8fHxUr16dTlz5oyIiAwfPlxmzZolR44ckZtuuill/6uvviovvviiiIjUrFlTXnvttZQx2rVrJ+vWrbts+/Dhw1KtWrUUueLi4kREZNSoUfLCCy+IiMjSpUulUaNGIiIybdo0GTlypIi4T8k9YMAAue222yQhISHDz0TTXCjK5ezdK/L88yJXXy1y7Jjdt26dyO7dvr0ubtJcqKWQbCUkL0u8eNFr1kIymaXOnjZtGi+88AK//fbbZZlOCxYsSNeuXfniiy9ISEhg0aJF9OjRgzVr1rB582Zat25N48aNmTFjBjFp8n/36dMnS5nWrFlD27ZtU+QqV64cAKtWraJ///4AdOzYkbi4OE6ePHnJuatXr+a+++4DbEruVatWpfTdc889FEjrbVMUJVOqV4cXXrDV4pJrTo8aZetO33orLFgAeTkbq0ph7Fg7bZSWxESv+RYg56mzwf7Az507l2XLltG8eXNKliyJiHDzzTenpM7evHkzU6ZMyfJ6aRGRlFTa6fenJ6Pj3PV7cm1FUS4n7b/Z3LnWSf3bb9Czp63vMnly3sihSmH16suTl1y8CHngDM4qdTZA+/bt+fXXX5k0aVKKBdCyZUt+/PFHdiRNQJ49e9ZtUZz06bSTadWqFStWrGB3kofr2LFjgFVUHyQVqF6+fDkVKlSgVKlSl5zrLiW3oijeoUYNG7q6Z491UDdpkhruevy43ecr68Fv1ik4xvr1jl16+fLljBs3jkKFClGiRIkMLYUCBQrQvXt3pk+fzowZMwCoWLEi06dPp2/fvly4cAGAl156iWuuueay8wcOHMjw4cMpWrQoq1evTtlfsWJFoqKiuOuuu3C5XFSqVInvvvuOF154gUGDBtGwYUOKFSuWcs20vP322wwePJhx48ZRsWJFpk2b5q2PRFGUNBQsCHfcYVsyH39s10D07w8Z/GTkGr9MnZ0dNHV24KDfi6LknoQEu+7hiiugZcucj+MudbZaCoqiKAFEwYLQo4fvxlefgqIoipJC0CqFQJ8WCzb0+1CUwCAolUJoaChxcXH6Q+QniAhxcXGEJperUhTFbwlKn0K1atXYv38/WqrTfwgNDaVatWpOi6EoShYEpVIoVKhQhiuIFUVRlMwJyukjRVE
UJWeoUlAURVFSUKWgKIqipBDwK5qNMUeAmCwP9B8qAEedFiKX6D34B8Fwq+WaigAAB/9JREFUDxAc9xGI91BTRCqm3xnwSiHQMMZEZ7S0PJDQe/APguEeIDjuIxjuIRmdPlIURVFSUKWgKIqipKBKIe+JcloAL6D34B8Ewz1AcNxHMNwDoD4FRVEUJQ1qKSiKoigpqFJQFEVRUlCl4GOMMfcYY/4wxriMMW5D1owxXY0xW40xO4wxY/JSxqwwxpQzxnxnjNme9FrWzXF7jDG/GWM2GGOiMzomr8nqczWWt5P6NxljmjghZ2Z4cA/tjTEnkz73DcaY55yQMzOMMVONMYeNMb+76Q+E7yGre/D778EjRESbDxtQB7gWWA40c3NMAWAncCVQGNgI1HVa9jTyvQ6MSXo/BnjNzXF7gApOy5udzxXoBnwNGKAl8LPTcufgHtoDXzotaxb30RZoAvzupt+vvwcP78HvvwdPmloKPkZEtojI1iwOawHsEJFdInIRmAP4sOBetukBzEh6PwO400FZsoMnn2sPYKZY1gBljDFV8lrQTPD3vw2PEJEfgGOZHOLv34Mn9xAUqFLwD6oC+9Js70/a5y9cISIHAZJeK7k5ToDFxphfjDHheSadezz5XP39s/dUvlbGmI3GmK+NMfXyRjSv4u/fg6cE+vcQnPUU8hpjzBKgcgZdz4jI554MkcG+PI0VzuwesjFMaxGJNcZUAr4zxvyZ9HTlFJ58ro5/9lngiXy/YvPYnDHGdAMWAFf7XDLv4u/fgycEw/egSsEbiEjnXA6xH6ieZrsaEJvLMbNFZvdgjDlkjKkiIgeTTPrDbsaITXo9bIyZj536cFIpePK5Ov7ZZ0GW8onIqTTvvzLGvGeMqSAigZSgzd+/hywJku9Bp4/8hHXA1caYfxhjCgP3AgsdliktC4EBSe8HAJdZP8aY4saYksnvgS5AhlEaeYgnn+tC4IGk6JeWwMnkqTI/Ict7MMZUNsaYpPctsP/XcXkuae7w9+8hS4Lke1BLwdcYY3oC7wAVgUXGmA0icosxJgyYLCLdRCTBGDMK+BYbbTJVRP5wUOz0vArMNcYMAfYC9wCkvQfgCmB+0v9EQeBDEfnGIXkBcPe5GmOGJ/VPBL7CRr7sAM4Cg5ySNyM8vIe7gRHGmATgHHCvJIXD+AvGmI+w0TkVjDH7geeBQhAY3wN4dA9+/z14gqa5UBRFUVLQ6SNFURQlBVUKiqIoSgqqFBRFUZQUVCkoiqIoKahSUBRFUVJQpaDkS4wx4caYHOVwMsZMzyoLrCfH+BpjzGhjTPsM9ktSmKuiXIYqBSW/Ek7gJPbLKaOxcfWK4jGqFBRFUZQUVCkoAUPylIwx5k5jzJ/GmPPGmFXGmLrpjgsxxoxJKthywRizzRgzIE3/cqApMCBpKkWMMQOT+h5IGvOYMea4MeZ7k0lxpGzKX8MYMydp7LPGmG+NMdem6a+VJEtvY0xkUsGW/caYF40xIenGusfYokfnkmS8Pt197AHKA8+nucf2aYYoYIz5jzHmiLGFY941xhTxxn0qgY0qBSXQqAm8CYwF7gNKA98aY0LTHPMO8CwQBdwGzAemGmO6J/U/CPyJTa3QKqktSuqrBczEpvK4D5uo7QdjzJW5EdoYUw5YhS24NBzoDRQHlhhjiqY7/HXgDDZtwmzguaT3yWM1w9ZV+BXoic0b9HG6MXoCJ4Epae7x1zT9TwBhQD9gHBABPJKbe1SCBKer/GjT5mkDpmPTKd+YZl9NIAEYnrR9FeACBqQ7dyawLs12NDA9i+uFYPM4/Qk8l06OaA9kjU6zPRabHK1cmn1lsT/cI5O2ayXd38x0Y20A5qTZ/gSbbNCk2Tc66dyBafYdBV7IQDYBfki3bwGwxunvWJvzTS0FJdA4LCI/JW+ISAzwCzZNN0AnrFKYb4wpmNyApUBjY0yBzAY3xtQxxsw3xhwCEoF47NP9NbmUuzPwHXAqjUynk2RPPz21ON3
2Zmwq6WSaA1+ISNrEZdnNqpvVNZR8imZJVQKNjGo5HAaSSzdWwGYTPenm/CrYKaHLSEr9vRg4BDwOxADngclAaEbnZIMK2NrDfTLoW5pu+0S67Yvprl8ZOJLumPTbWZHVNZR8iioFJdDIqBRoJSA51fgx7HRSa6zFkJ4MCwQl0Qr7tHyziPyZvNMYUzpnol7CMezT/NgM+k5nc6y/sKnY05J+W1FyhCoFJdCoZIy5MXkKyRhTA2gCTEvqX4a1FEqLyHeZjJPRk3Gyw/dC8g5jzI3Yuf5fcin3Uqxz+Q8ROZfLsdYBtxtj/i/NFNIdGRynT/9KtlGloAQaR4FZxph/YQuZ/Bv79D8dQES2GmMmAnOMMa9jHcqhQD3gGhEZmjTOn8AtxphbsA7g3cAabNTPpKRzqwEvAAe8IPeb2EifZcaYd5LGvAJoB6wSkY+yMdZrwM/Ye5wG1AGGJfWltY7+BG4zxnyDva+tIpJdq0TJZ6ijWQk0YoCnsD/Wc4BTwC0icj7NMSOx0zQPYMNOp2NDU9PWi34J2ALMJenJW0QOYUNRK2NLjj6KDR/dkVuhxdbpbYn9oR6P9V28jg2p3ZTNsaKBvti1FguAXsCIpO5TaQ59CvgbG267Lul4RckUrbymBAzGmOlAfRHxymKyYMIY0w+YBVwpIrudlkcJXHT6SFECEGPM+9gQ1+NYn8qzwCJVCEpuUaWgKIFJeeC9pNc47Irm0Y5KpAQFOn2kKIqipKCOZkVRFCUFVQqKoihKCqoUFEVRlBRUKSiKoigpqFJQFEVRUvh/g+2ZqDxXIRIAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_setosa_versicolor(X = transfX)\n", + "plot_decision_boundary(SVC_clf)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('scaler',\n", + " StandardScaler(copy=True, with_mean=True, with_std=True)),\n", + " ('clf',\n", + " SGDClassifier(alpha=0.0001, average=False, class_weight=None,\n", + " early_stopping=False, epsilon=0.1, eta0=0.001,\n", + " fit_intercept=True, l1_ratio=0.15,\n", + " learning_rate='constant', loss='hinge',\n", + " max_iter=1000, n_iter_no_change=5, n_jobs=None,\n", + " penalty='l2', power_t=0.5, random_state=42,\n", + " shuffle=True, tol=0.001, validation_fraction=0.1,\n", + " verbose=0, warm_start=False))],\n", + " verbose=False)" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import SGDClassifier\n", + "\n", + "sgd_clf = Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('clf', SGDClassifier(loss=\"hinge\", learning_rate=\"constant\", eta0=0.001,\n", + " max_iter=1000, tol=1e-3, random_state=42)) \n", + "])\n", + "\n", + "sgd_clf.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plot_setosa_versicolor(X = transfX)\n", + "plot_decision_boundary(sgd_clf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**EXERCISE 10**\n", + "\n", + "Train SVR on California Housing Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_california_housing\n", + "\n", + "housing = fetch_california_housing()\n", + "X = housing[\"data\"]\n", + "y = housing[\"target\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(20640, 8)" + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('scaler',\n", + " StandardScaler(copy=True, with_mean=True, with_std=True)),\n", + " ('model',\n", + " SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,\n", + " gamma='scale', kernel='rbf', max_iter=-1, shrinking=True,\n", + " tol=0.001, verbose=False))],\n", + " verbose=False)" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.svm import SVR\n", + "\n", + "SVR_model = Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('model', SVR())\n", + "])\n", + "\n", + "SVR_model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7275639524733043" + ] + }, + "execution_count": 134, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "SVR_model.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.3570026426754465\n", + "0.5974969813107398\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "y_pred = SVR_model.predict(X_test)\n", + "mse = mean_squared_error(y_pred, y_test)\n", + "rmse = np.sqrt(mse)\n", + "print(mse)\n", + "print(rmse)" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 3 folds for each of 10 candidates, totalling 30 fits\n", + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492 ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492, total= 10.9s\n", + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492 ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 10.8s remaining: 0.0s\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492, total= 10.7s\n", + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492 ...\n", + "[CV] model__C=8.732501769442347, model__gamma=0.014138684138012492, total= 10.9s\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492 ...\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492, total= 
10.1s\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492 ...\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492, total= 10.1s\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492 ...\n", + "[CV] model__C=10.6073996426796, model__gamma=0.0010782720368618492, total= 10.0s\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784 ....\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784, total= 11.1s\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784 ....\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784, total= 11.1s\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784 ....\n", + "[CV] model__C=5.173843032530275, model__gamma=0.03912018707260784, total= 11.2s\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594 ...\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594, total= 11.2s\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594 ...\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594, total= 11.6s\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594 ...\n", + "[CV] model__C=8.757343573995623, model__gamma=0.026503976649874594, total= 11.8s\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165 ..\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165, total= 9.9s\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165 ..\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165, total= 10.1s\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165 ..\n", + "[CV] model__C=3.993757431556655, model__gamma=0.0037627657376435165, total= 7.3s\n", + "[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009 ....\n", + "[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009, total= 6.4s\n", + 
"[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009 ....\n", + "[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009, total= 8.2s\n", + "[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009 ....\n", + "[CV] model__C=5.908725733130206, model__gamma=0.04722302780443009, total= 11.8s\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101 ...\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101, total= 10.3s\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101 ...\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101, total= 5.7s\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101 ...\n", + "[CV] model__C=6.339380352838232, model__gamma=0.004194342545426101, total= 9.1s\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465 ...\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465, total= 10.7s\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465 ...\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465, total= 11.0s\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465 ...\n", + "[CV] model__C=9.323582401718916, model__gamma=0.011028101996364465, total= 11.6s\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697 ....\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697, total= 11.9s\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697 ....\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697, total= 6.6s\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697 ....\n", + "[CV] model__C=7.485779608851653, model__gamma=0.03852395227919697, total= 11.9s\n", + "[CV] model__C=5.32139202170268, model__gamma=0.00373396867904038 .....\n", + "[CV] model__C=5.32139202170268, model__gamma=0.00373396867904038, total= 10.0s\n", + "[CV] 
model__C=5.32139202170268, model__gamma=0.00373396867904038 .....\n", + "[CV] model__C=5.32139202170268, model__gamma=0.00373396867904038, total= 10.0s\n", + "[CV] model__C=5.32139202170268, model__gamma=0.00373396867904038 .....\n", + "[CV] model__C=5.32139202170268, model__gamma=0.00373396867904038, total= 10.0s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 5.1min finished\n" + ] + }, + { + "data": { + "text/plain": [ + "RandomizedSearchCV(cv=3, error_score=nan,\n", + " estimator=Pipeline(memory=None,\n", + " steps=[('scaler',\n", + " StandardScaler(copy=True,\n", + " with_mean=True,\n", + " with_std=True)),\n", + " ('model',\n", + " SVR(C=1.0, cache_size=200,\n", + " coef0=0.0, degree=3,\n", + " epsilon=0.1, gamma='scale',\n", + " kernel='rbf', max_iter=-1,\n", + " shrinking=True, tol=0.001,\n", + " verbose=False))],\n", + " verbose=False),\n", + " iid='deprecated', n_iter=10, n_jobs=None,\n", + " param_distributions={'model__C': ,\n", + " 'model__gamma': },\n", + " pre_dispatch='2*n_jobs', random_state=None, refit=True,\n", + " return_train_score=False, scoring=None, verbose=2)" + ] + }, + "execution_count": 136, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import RandomizedSearchCV\n", + "from scipy.stats import reciprocal, uniform\n", + "\n", + "SVR_model = Pipeline([\n", + " ('scaler', StandardScaler()),\n", + " ('model', SVR())\n", + "])\n", + "\n", + "param_dists = {\n", + " 'model__gamma': reciprocal(0.001, 0.1),\n", + " 'model__C': uniform(1,10)\n", + "}\n", + "\n", + "rnd_search_cv = RandomizedSearchCV(SVR_model, param_dists, verbose=2, cv=3)\n", + "rnd_search_cv.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('scaler',\n", + " StandardScaler(copy=True, with_mean=True, 
with_std=True)),\n", + " ('model',\n", + " SVR(C=5.908725733130206, cache_size=200, coef0=0.0, degree=3,\n", + " epsilon=0.1, gamma=0.04722302780443009, kernel='rbf',\n", + " max_iter=-1, shrinking=True, tol=0.001, verbose=False))],\n", + " verbose=False)" + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rnd_search_cv.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.37646952041010656\n", + "0.6135711209062129\n" + ] + } + ], + "source": [ + "y_pred = rnd_search_cv.predict(X_test)\n", + "mse = mean_squared_error(y_pred, y_test)\n", + "rmse = np.sqrt(mse)\n", + "print(mse)\n", + "print(rmse)\n", + "\n", + "# Note that our original model performed better thanks to its gamma = 'scale' default option\n", + "# To improve performance, further tuning of gamma would be required" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Ch6/.ipynb_checkpoints/Exercises-checkpoint.ipynb b/Ch6/.ipynb_checkpoints/Exercises-checkpoint.ipynb new file mode 100644 index 000000000..5d445f323 --- /dev/null +++ b/Ch6/.ipynb_checkpoints/Exercises-checkpoint.ipynb @@ -0,0 +1,1552 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import os\n", + "from sklearn.datasets import make_moons\n", + 
"from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit\n", + "from matplotlib import pyplot as plt\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.base import clone" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise 7**" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = make_moons(n_samples=10000, noise=0.4)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(10000, 2)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.25044937, 1.08585135])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(10000,)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(X[y==0], 'r+')\n", + "plt.plot(X[y==1], 'gx')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(X_train[y_train==0], 'r+')\n", + "plt.plot(X_train[y_train==1], 'gx')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 145 candidates, totalling 725 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[Parallel(n_jobs=1)]: Done 725 out of 725 | elapsed: 6.5s finished\n" + ] + }, + { + "data": { + "text/plain": [ + "GridSearchCV(cv=5, error_score=nan,\n", + " estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=None,\n", + " max_features=None,\n", + " max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0,\n", + " min_impurity_split=None,\n", + " min_samples_leaf=1,\n", + " min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0,\n", + " presort='deprecated',\n", + " random_state=None,\n", + " splitter='best'),\n", + " iid='de...\n", + " 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,\n", + " 27., 28., 29., 30., 31., 32.])},\n", + " {'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. 
])},\n", + " {'min_samples_leaf': array([0.1, 0.2, 0.3, 0.4, 0.5])},\n", + " {'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,\n", + " 13, 14, 15, 16, 17, 18, 19, 20, 21,\n", + " 22, 23, 24, 25, 26, 27, 28, 29, 30,\n", + " 31, ...]}],\n", + " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", + " scoring=None, verbose=1)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "param_grid = [\n", + " {'max_depth': np.linspace(1, 32, 32, endpoint=True)},\n", + " {'min_samples_split': np.linspace(0.1, 1, 10)},\n", + " {'min_samples_leaf': np.linspace(0.1, 0.5, 5)},\n", + " {'max_leaf_nodes': list(range(2,100))}\n", + "]\n", + "\n", + "clf = DecisionTreeClassifier()\n", + "\n", + "grid_search = GridSearchCV(clf, param_grid, cv=5, verbose=1)\n", + "\n", + "grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n", + " max_depth=2.0, max_features=None, max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, presort='deprecated',\n", + " random_state=None, splitter='best')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_search.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8565555555555555" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_search.best_score_" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.851" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "y_pred = grid_search.predict(X_test)\n", + "accuracy_score(y_pred, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,\n", + " 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,\n", + " 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,\n", + " 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,\n", + " 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,\n", + " 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,\n", + " 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,\n", + " 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,\n", + " 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,\n", + " 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,\n", + " 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,\n", + " 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,\n", + " 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,\n", + " 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,\n", + " 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,\n", + " 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,\n", + " 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,\n", + " 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,\n", + " 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,\n", + " 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,\n", + " 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,\n", + " 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,\n", + " 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 
1, 0, 1, 1, 1, 0, 0, 1, 1,\n", + " 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,\n", + " 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,\n", + " 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,\n", + " 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,\n", + " 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,\n", + " 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,\n", + " 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,\n", + " 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,\n", + " 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,\n", + " 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,\n", + " 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,\n", + " 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,\n", + " 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,\n", + " 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,\n", + " 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,\n", + " 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,\n", + " 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1,\n", + " 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,\n", + " 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,\n", + " 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,\n", + " 0, 1, 1, 0, 0, 0, 0, 0, 0, 1], dtype=int64)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise 8**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "n_trees = 1000\n", + "n_instances = 
100\n", + "\n", + "subsets=[]\n", + "\n", + "# Randomly split up the training set\n", + "rs = ShuffleSplit(n_splits=n_trees, test_size = len(X_train)-n_instances)\n", + "\n", + "for train_subset_index, test_subset_index in rs.split(X_train):\n", + " X_mini_train = X_train[train_subset_index]\n", + " y_mini_train = y_train[train_subset_index]\n", + " subsets.append((X_mini_train, y_mini_train))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a bunch of forest predictors based on our best estimator\n", + "forest = [clone(grid_search.best_estimator_) for _ in range(n_trees)]\n", + "\n", + "accuracy_scores = []\n", + "\n", + "# Fit each tree to its training subset and test accuracy\n", + "for tree, (X_mini_train, y_mini_train) in zip(forest, subsets):\n", + " tree.fit(X_mini_train, y_mini_train)\n", + " \n", + " y_pred = tree.predict(X_test)\n", + " accuracy_scores.append(accuracy_score(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8245450000000001" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(accuracy_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nTODO Finish it up with majority rule!\\n'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "TODO Finish it up with majority rule!\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import mode\n", + "\n", + "# Empty array for our predictions\n", + "y_pred = []\n", + "\n", + "for row in X_test:\n", + " predictions = []\n", + " \n", + " # Get a prediction for our sample (row) from each tree\n", + " for 
tree in forest:\n", + " predictions.append(tree.predict(row.reshape(1,-1)))\n", + " \n", + " # Find the 'best' predictors using SciPy's mode\n", + " y_pred.append(mode(predictions)[0][0][0])" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 
1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + 
" 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", 
+ " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 
0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + 
" 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.853" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracy_score(y_pred, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Ch6/Exercises.ipynb b/Ch6/Exercises.ipynb new file mode 100644 index 000000000..5d445f323 --- /dev/null +++ b/Ch6/Exercises.ipynb @@ -0,0 +1,1552 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import os\n", + "from sklearn.datasets import make_moons\n", + "from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit\n", + "from matplotlib import pyplot as plt\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.base import clone" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise 7**" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = make_moons(n_samples=10000, noise=0.4)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(10000, 2)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-0.25044937, 1.08585135])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(10000,)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y.shape" + 
] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(X[y==0], 'r+')\n", + "plt.plot(X[y==1], 'gx')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(X_train[y_train==0], 'r+')\n", + "plt.plot(X_train[y_train==1], 'gx')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 145 candidates, totalling 725 fits\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n", + "[Parallel(n_jobs=1)]: Done 725 out of 725 | elapsed: 6.5s finished\n" + ] + }, + { + "data": { + "text/plain": [ + "GridSearchCV(cv=5, error_score=nan,\n", + " estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=None,\n", + " max_features=None,\n", + " max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0,\n", + " min_impurity_split=None,\n", + " min_samples_leaf=1,\n", + " min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0,\n", + " presort='deprecated',\n", + " random_state=None,\n", + " splitter='best'),\n", + " iid='de...\n", + " 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,\n", + " 27., 28., 29., 30., 31., 32.])},\n", + " {'min_samples_split': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. 
])},\n", + " {'min_samples_leaf': array([0.1, 0.2, 0.3, 0.4, 0.5])},\n", + " {'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,\n", + " 13, 14, 15, 16, 17, 18, 19, 20, 21,\n", + " 22, 23, 24, 25, 26, 27, 28, 29, 30,\n", + " 31, ...]}],\n", + " pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n", + " scoring=None, verbose=1)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "param_grid = [\n", + " {'max_depth': np.linspace(1, 32, 32, endpoint=True)},\n", + " {'min_samples_split': np.linspace(0.1, 1, 10)},\n", + " {'min_samples_leaf': np.linspace(0.1, 0.5, 5)},\n", + " {'max_leaf_nodes': list(range(2,100))}\n", + "]\n", + "\n", + "clf = DecisionTreeClassifier()\n", + "\n", + "grid_search = GridSearchCV(clf, param_grid, cv=5, verbose=1)\n", + "\n", + "grid_search.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',\n", + " max_depth=2.0, max_features=None, max_leaf_nodes=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, presort='deprecated',\n", + " random_state=None, splitter='best')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_search.best_estimator_" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8565555555555555" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_search.best_score_" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.851" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "y_pred = grid_search.predict(X_test)\n", + "accuracy_score(y_pred, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,\n", + " 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,\n", + " 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,\n", + " 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,\n", + " 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,\n", + " 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,\n", + " 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,\n", + " 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,\n", + " 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,\n", + " 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,\n", + " 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,\n", + " 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,\n", + " 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,\n", + " 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,\n", + " 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0,\n", + " 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0,\n", + " 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0,\n", + " 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,\n", + " 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1,\n", + " 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,\n", + " 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,\n", + " 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,\n", + " 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,\n", + " 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 
1, 0, 1, 1, 1, 0, 0, 1, 1,\n", + " 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,\n", + " 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,\n", + " 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1,\n", + " 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,\n", + " 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,\n", + " 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1,\n", + " 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,\n", + " 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,\n", + " 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,\n", + " 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0,\n", + " 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,\n", + " 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,\n", + " 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,\n", + " 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,\n", + " 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,\n", + " 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,\n", + " 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1,\n", + " 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,\n", + " 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,\n", + " 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,\n", + " 0, 1, 1, 0, 0, 0, 0, 0, 0, 1], dtype=int64)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise 8**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "n_trees = 1000\n", + "n_instances = 
100\n", + "\n", + "subsets=[]\n", + "\n", + "# Randomly split up the training set\n", + "rs = ShuffleSplit(n_splits=n_trees, test_size = len(X_train)-n_instances)\n", + "\n", + "for train_subset_index, test_subset_index in rs.split(X_train):\n", + " X_mini_train = X_train[train_subset_index]\n", + " y_mini_train = y_train[train_subset_index]\n", + " subsets.append((X_mini_train, y_mini_train))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a bunch of forest predictors based on our best estimator\n", + "forest = [clone(grid_search.best_estimator_) for _ in range(n_trees)]\n", + "\n", + "accuracy_scores = []\n", + "\n", + "# Fit each tree to its training subset and test accuracy\n", + "for tree, (X_mini_train, y_mini_train) in zip(forest, subsets):\n", + " tree.fit(X_mini_train, y_mini_train)\n", + " \n", + " y_pred = tree.predict(X_test)\n", + " accuracy_scores.append(accuracy_score(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8245450000000001" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(accuracy_scores)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nTODO Finish it up with majority rule!\\n'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "TODO Finish it up with majority rule!\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.stats import mode\n", + "\n", +
"# Majority-rule ensemble: every tree votes on every test sample.\n", + "# One predict() call per tree (not one per sample per tree) gives a matrix\n", + "# with one row per tree and one column per test sample.\n", + "tree_predictions = np.array([tree.predict(X_test) for tree in forest])\n", + "\n", + "# The winning class for a sample is the most frequent value in its column.\n", + "# np.ravel flattens the result whether mode() returns it with or without a\n", + "# leading length-1 axis, which depends on the installed SciPy version.\n", + "majority_votes = mode(tree_predictions, axis=0)[0]\n", + "\n", + "# Keep y_pred a plain list, as the cells below expect\n", + "y_pred = np.ravel(majority_votes).tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + "
1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + 
" 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", 
+ " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 
0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + 
" 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 1,\n", + " 1,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 0,\n", + " 1]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.853" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracy_score(y_pred, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Ch7/.ipynb_checkpoints/Exercises-checkpoint.ipynb b/Ch7/.ipynb_checkpoints/Exercises-checkpoint.ipynb new file mode 100644 index 000000000..c3cab63fb --- /dev/null +++ b/Ch7/.ipynb_checkpoints/Exercises-checkpoint.ipynb @@ -0,0 +1,521 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from matplotlib import pyplot as plt\n", + "import os\n", + "from sklearn.datasets import fetch_openml\n", + "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.preprocessing import normalize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise 8**\n", + "\n", + "Create hard/soft voting ensemble on mnist" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "mnist = fetch_openml('mnist_784', version=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mnist.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(70000, 784)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X, y = mnist['data'], mnist['target']\n", 
+ "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Split into train, val, test sets of size 50k, 10k, 10k and scale the\n", + "# features by 1/255.\n", + "# The division is done out of place while slicing: an in-place `/=` on a\n", + "# slice would also rescale X itself, and again on every re-run of this cell.\n", + "\n", + "X_train = X[:50000] / 255.0\n", + "y_train = y[:50000]\n", + "X_val = X[50000:60000] / 255.0\n", + "y_val = y[50000:60000]\n", + "X_test = X[60000:] / 255.0\n", + "y_test = y[60000:]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0\n" + ] + } + ], + "source": [ + "print(X_test.max())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training our RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=None, max_features='auto',\n", + " max_leaf_nodes=None, max_samples=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=100,\n", + " n_jobs=None, oob_score=False, random_state=None,\n", + " verbose=0, warm_start=False)\n", + "Training our ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=None, max_features='auto',\n", + " max_leaf_nodes=None, max_samples=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=100,\n", + " n_jobs=None, oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False)\n", + "Training our LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", + " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", + " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", + "
verbose=0)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n" + ] + } + ], + "source": [ + "rfc = RandomForestClassifier()\n", + "etc = ExtraTreesClassifier()\n", + "svc = LinearSVC()\n", + "\n", + "classifiers = [rfc, etc, svc]\n", + "scores = []\n", + "\n", + "# Fit each classifier to the training set and predict on X_val\n", + "for clf in classifiers:\n", + " print('Training our ', clf)\n", + " clf.fit(X_train, y_train)\n", + " score = clf.score(X_val, y_val)\n", + " scores.append(score)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.9719, 0.9741, 0.9208]\n" + ] + } + ], + "source": [ + "print(scores)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "VotingClassifier(estimators=[('rf',\n", + " RandomForestClassifier(bootstrap=True,\n", + " ccp_alpha=0.0,\n", + " class_weight=None,\n", + " criterion='gini',\n", + " max_depth=None,\n", + " max_features='auto',\n", + " max_leaf_nodes=None,\n", + " max_samples=None,\n", + " min_impurity_decrease=0.0,\n", + " min_impurity_split=None,\n", + " min_samples_leaf=1,\n", + " min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0,\n", + " n_estimators=100,\n", + " n_jobs=None,\n", + " oob_score...\n", + " n_estimators=100,\n", 
+ " n_jobs=None, oob_score=False,\n", + " random_state=None, verbose=0,\n", + " warm_start=False)),\n", + " ('sv',\n", + " LinearSVC(C=1.0, class_weight=None, dual=True,\n", + " fit_intercept=True, intercept_scaling=1,\n", + " loss='squared_hinge', max_iter=1000,\n", + " multi_class='ovr', penalty='l2',\n", + " random_state=None, tol=0.0001,\n", + " verbose=0))],\n", + " flatten_transform=True, n_jobs=None, voting='hard',\n", + " weights=None)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import VotingClassifier\n", + "\n", + "# Hard vote ensemble\n", + "voting_clf = VotingClassifier(\n", + " estimators=[('rf', rfc), ('et', etc), ('sv', svc)],\n", + " voting='hard'\n", + ")\n", + "\n", + "voting_clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9719" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "voting_clf.score(X_val, y_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Try without SVC\n", + "\n", + "del voting_clf.estimators_[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9732" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "voting_clf.score(X_val, y_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9752" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Set to soft voting and check if better\n", + "\n", + "voting_clf.voting='soft'\n", + "\n", + "voting_clf.score(X_val, y_val)" + ] + }, + { + "cell_type": "code", + "execution_count":
15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9707" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check on Test Set\n", + "\n", + "voting_clf.score(X_test, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise 9**\n", + "\n", + "train a stacking ensemble on our previous classifiers" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# Round up our predictions\n", + "\n", + "X_val_predictions = np.empty((len(X_val), len(classifiers)), dtype=np.float32)\n", + "\n", + "for index, clf in enumerate(classifiers):\n", + " X_val_predictions[:, index] = clf.predict(X_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[3. 3. 3.]\n", + " [8. 8. 8.]\n", + " [6. 6. 6.]\n", + " ...\n", + " [5. 5. 5.]\n", + " [6. 6. 6.]\n", + " [8. 8. 
8.]]\n" + ] + } + ], + "source": [ + "print(X_val_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=None, max_features='auto',\n", + " max_leaf_nodes=None, max_samples=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=200,\n", + " n_jobs=None, oob_score=True, random_state=None,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train a classifier which will take as input our predictions matrix\n", + "blender = RandomForestClassifier(n_estimators=200, oob_score=True)\n", + "blender.fit(X_val_predictions, y_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9727" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check our out of bag score to get an idea of accuracy\n", + "blender.oob_score_" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# Round up predictions for X_test: one row per test sample, one column per\n", + "# base classifier. The row count must come from X_test (not X_val), since\n", + "# each column is filled from clf.predict(X_test).\n", + "X_test_predictions = np.empty((len(X_test), len(classifiers)), dtype=np.float32)\n", + "\n", + "for index, clf in enumerate(classifiers):\n", + " X_test_predictions[:, index] = clf.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# Use our blender to predict based on our predictions matrix\n", + "y_pred = blender.predict(X_test_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.968" + ] + }, +
"execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracy_score(y_pred, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Ch7/Exercises.ipynb b/Ch7/Exercises.ipynb new file mode 100644 index 000000000..c3cab63fb --- /dev/null +++ b/Ch7/Exercises.ipynb @@ -0,0 +1,521 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from matplotlib import pyplot as plt\n", + "import os\n", + "from sklearn.datasets import fetch_openml\n", + "from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.preprocessing import normalize" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise 8**\n", + "\n", + "Create hard/soft voting ensemble on mnist" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "mnist = fetch_openml('mnist_784', version=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"mnist.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(70000, 784)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X, y = mnist['data'], mnist['target']\n", + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Split into train, val, test sets of size 50k, 10k, 10k and scale the\n", + "# features by 1/255.\n", + "# The division is done out of place while slicing: an in-place `/=` on a\n", + "# slice would also rescale X itself, and again on every re-run of this cell.\n", + "\n", + "X_train = X[:50000] / 255.0\n", + "y_train = y[:50000]\n", + "X_val = X[50000:60000] / 255.0\n", + "y_val = y[50000:60000]\n", + "X_test = X[60000:] / 255.0\n", + "y_test = y[60000:]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0\n" + ] + } + ], + "source": [ + "print(X_test.max())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training our RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=None, max_features='auto',\n", + " max_leaf_nodes=None, max_samples=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=100,\n", + " n_jobs=None, oob_score=False, random_state=None,\n", + " verbose=0, warm_start=False)\n", + "Training our ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=None, max_features='auto',\n", + " max_leaf_nodes=None, max_samples=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=100,\n", + " n_jobs=None,
oob_score=False, random_state=None, verbose=0,\n", + " warm_start=False)\n", + "Training our LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", + " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", + " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", + " verbose=0)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n" + ] + } + ], + "source": [ + "rfc = RandomForestClassifier()\n", + "etc = ExtraTreesClassifier()\n", + "svc = LinearSVC()\n", + "\n", + "classifiers = [rfc, etc, svc]\n", + "scores = []\n", + "\n", + "# Fit each classifier to the training set and predict on X_val\n", + "for clf in classifiers:\n", + " print('Training our ', clf)\n", + " clf.fit(X_train, y_train)\n", + " score = clf.score(X_val, y_val)\n", + " scores.append(score)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.9719, 0.9741, 0.9208]\n" + ] + } + ], + "source": [ + "print(scores)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " \"the number of iterations.\", ConvergenceWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "VotingClassifier(estimators=[('rf',\n", + " RandomForestClassifier(bootstrap=True,\n", + " ccp_alpha=0.0,\n", + " class_weight=None,\n", + " criterion='gini',\n", + " max_depth=None,\n", + " max_features='auto',\n", + " 
max_leaf_nodes=None,\n", + " max_samples=None,\n", + " min_impurity_decrease=0.0,\n", + " min_impurity_split=None,\n", + " min_samples_leaf=1,\n", + " min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0,\n", + " n_estimators=100,\n", + " n_jobs=None,\n", + " oob_score...\n", + " n_estimators=100,\n", + " n_jobs=None, oob_score=False,\n", + " random_state=None, verbose=0,\n", + " warm_start=False)),\n", + " ('sv',\n", + " LinearSVC(C=1.0, class_weight=None, dual=True,\n", + " fit_intercept=True, intercept_scaling=1,\n", + " loss='squared_hinge', max_iter=1000,\n", + " multi_class='ovr', penalty='l2',\n", + " random_state=None, tol=0.0001,\n", + " verbose=0))],\n", + " flatten_transform=True, n_jobs=None, voting='hard',\n", + " weights=None)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.ensemble import VotingClassifier\n", + "\n", + "# Hard vote ensemble\n", + "voting_clf = VotingClassifier(\n", + " estimators=[('rf', rfc), ('et', etc), ('sv', svc)],\n", + " voting='hard'\n", + ")\n", + "\n", + "voting_clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9719" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "voting_clf.score(X_val, y_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Try without SVC\n", + "\n", + "del voting_clf.estimators_[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9732" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "voting_clf.score(X_val, y_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [
+ "0.9752" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Switch to soft voting and check whether the validation score improves\n", + "\n", + "voting_clf.voting='soft'\n", + "\n", + "voting_clf.score(X_val, y_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9707" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check on the test set\n", + "\n", + "voting_clf.score(X_test, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Exercise 9**\n", + "\n", + "Train a stacking ensemble on our previous classifiers." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# Gather each classifier's predictions on X_val into one matrix (one column per classifier)\n", + "\n", + "X_val_predictions = np.empty((len(X_val), len(classifiers)), dtype=np.float32)\n", + "\n", + "for index, clf in enumerate(classifiers):\n", + " X_val_predictions[:, index] = clf.predict(X_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[3. 3. 3.]\n", + " [8. 8. 8.]\n", + " [6. 6. 6.]\n", + " ...\n", + " [5. 5. 5.]\n", + " [6. 6. 6.]\n", + " [8. 8. 
8.]]\n" + ] + } + ], + "source": [ + "print(X_val_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=None, max_features='auto',\n", + " max_leaf_nodes=None, max_samples=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=200,\n", + " n_jobs=None, oob_score=True, random_state=None,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train a classifier which will take as input our predictions matrix\n", + "blender = RandomForestClassifier(n_estimators=200, oob_score=True)\n", + "blender.fit(X_val_predictions, y_val)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9727" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Check our out of bag score to get an idea of accuracy\n", + "blender.oob_score_" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "# Gather predictions for X_test: one row per test sample (sized from X_test, not X_val), one column per classifier\n", + "X_test_predictions = np.empty((len(X_test), len(classifiers)), dtype=np.float32)\n", + "\n", + "for index, clf in enumerate(classifiers):\n", + " X_test_predictions[:, index] = clf.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "# Use our blender to predict based on our predictions matrix\n", + "y_pred = blender.predict(X_test_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.968" + ] + }, + 
"execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracy_score(y_test, y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}