GeronBook/Ch7/Exercises.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from matplotlib import pyplot as plt\n",
"import os\n",
"from sklearn.datasets import fetch_openml\n",
"from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.preprocessing import normalize"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Exercise 8**\n",
"\n",
"Create hard/soft voting ensemble on mnist"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"mnist = fetch_openml('mnist_784', version=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mnist.keys()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(70000, 784)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X, y = mnist['data'], mnist['target']\n",
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Split into train, val, test sets of size 50k, 10k, 10k\n",
"\n",
"X_train = X[:50000]\n",
"y_train = y[:50000]\n",
"X_val = X[50000:60000]\n",
"y_val = y[50000:60000]\n",
"X_test = X[60000:]\n",
"y_test = y[60000:]\n",
"\n",
"# Normalize features\n",
"\n",
"X_train /= 255.0\n",
"X_val /= 255.0\n",
"X_test /= 255.0"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0\n"
]
}
],
"source": [
"print(X_test.max())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training our RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
" criterion='gini', max_depth=None, max_features='auto',\n",
" max_leaf_nodes=None, max_samples=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100,\n",
" n_jobs=None, oob_score=False, random_state=None,\n",
" verbose=0, warm_start=False)\n",
"Training our ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,\n",
" criterion='gini', max_depth=None, max_features='auto',\n",
" max_leaf_nodes=None, max_samples=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100,\n",
" n_jobs=None, oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False)\n",
"Training our LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
" verbose=0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
" \"the number of iterations.\", ConvergenceWarning)\n"
]
}
],
"source": [
"rfc = RandomForestClassifier()\n",
"etc = ExtraTreesClassifier()\n",
"svc = LinearSVC()\n",
"\n",
"classifiers = [rfc, etc, svc]\n",
"scores = []\n",
"\n",
"# Fit each classifier to the training set and predict on X_val\n",
"for clf in classifiers:\n",
" print('Training our ', clf)\n",
" clf.fit(X_train, y_train)\n",
" score = clf.score(X_val, y_val)\n",
" scores.append(score)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.9719, 0.9741, 0.9208]\n"
]
}
],
"source": [
"print(scores)"
]
},
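{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Note (not in the original solution):* `LinearSVC` emitted a `ConvergenceWarning` above. A minimal sketch of two common remedies, with illustrative parameter values rather than values from the book: give liblinear more iterations and a looser tolerance, or solve the primal problem with `dual=False` (often faster when there are far more samples than features)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: possible fixes for the LinearSVC ConvergenceWarning (illustrative values)\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"# Option 1: more iterations and a looser tolerance for liblinear\n",
"svc_more_iter = LinearSVC(max_iter=10000, tol=1e-3)\n",
"\n",
"# Option 2: solve the primal problem (n_samples >> n_features here)\n",
"svc_primal = LinearSVC(dual=False)\n",
"\n",
"# Either could replace `svc` in the classifiers list above, e.g.:\n",
"# svc_more_iter.fit(X_train, y_train)"
]
},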
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
" \"the number of iterations.\", ConvergenceWarning)\n"
]
},
{
"data": {
"text/plain": [
"VotingClassifier(estimators=[('rf',\n",
" RandomForestClassifier(bootstrap=True,\n",
" ccp_alpha=0.0,\n",
" class_weight=None,\n",
" criterion='gini',\n",
" max_depth=None,\n",
" max_features='auto',\n",
" max_leaf_nodes=None,\n",
" max_samples=None,\n",
" min_impurity_decrease=0.0,\n",
" min_impurity_split=None,\n",
" min_samples_leaf=1,\n",
" min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0,\n",
" n_estimators=100,\n",
" n_jobs=None,\n",
" oob_score...\n",
" n_estimators=100,\n",
" n_jobs=None, oob_score=False,\n",
" random_state=None, verbose=0,\n",
" warm_start=False)),\n",
" ('sv',\n",
" LinearSVC(C=1.0, class_weight=None, dual=True,\n",
" fit_intercept=True, intercept_scaling=1,\n",
" loss='squared_hinge', max_iter=1000,\n",
" multi_class='ovr', penalty='l2',\n",
" random_state=None, tol=0.0001,\n",
" verbose=0))],\n",
" flatten_transform=True, n_jobs=None, voting='hard',\n",
" weights=None)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.ensemble import VotingClassifier\n",
"\n",
"# Hard vote ensmeble\n",
"voting_clf = VotingClassifier(\n",
" estimators=[('rf', rfc), ('et', etc), ('sv', svc)],\n",
" voting='hard'\n",
")\n",
"\n",
"voting_clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9719"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"voting_clf.score(X_val, y_val)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Try without SVC\n",
"\n",
"del voting_clf.estimators_[2]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9732"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"voting_clf.score(X_val, y_val)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9752"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Set to soft voting and check if better\n",
"\n",
"voting_clf.voting='soft'\n",
"\n",
"voting_clf.score(X_val, y_val)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9707"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check on Test Set\n",
"\n",
"voting_clf.score(X_test, y_test)"
]
},
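{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Note (not in the original solution):* soft voting only worked above because the `LinearSVC`, which has no `predict_proba`, had already been removed. A hedged sketch of one way to keep an SVM in a soft-voting ensemble: wrap it in `CalibratedClassifierCV`, which adds calibrated probability estimates. The estimators and parameters below are illustrative, and fitting this refits all three models, so it is slow on MNIST."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: keep an SVM in a soft-voting ensemble via probability calibration\n",
"from sklearn.calibration import CalibratedClassifierCV\n",
"from sklearn.ensemble import VotingClassifier, RandomForestClassifier, ExtraTreesClassifier\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"# CalibratedClassifierCV exposes predict_proba for the wrapped LinearSVC\n",
"calibrated_svc = CalibratedClassifierCV(LinearSVC(max_iter=10000, tol=1e-3), cv=3)\n",
"\n",
"soft_voting_clf = VotingClassifier(\n",
"    estimators=[('rf', RandomForestClassifier()),\n",
"                ('et', ExtraTreesClassifier()),\n",
"                ('sv', calibrated_svc)],\n",
"    voting='soft'\n",
")\n",
"\n",
"# soft_voting_clf.fit(X_train, y_train)  # slow: refits all three models\n",
"# soft_voting_clf.score(X_val, y_val)"
]
},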
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Exercise 9**\n",
"\n",
"train a stacking ensemble on our previous classifiers"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Round up our predictions\n",
"\n",
"X_val_predictions = np.empty((len(X_val), len(classifiers)), dtype=np.float32)\n",
"\n",
"for index, clf in enumerate(classifiers):\n",
" X_val_predictions[:, index] = clf.predict(X_val)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[3. 3. 3.]\n",
" [8. 8. 8.]\n",
" [6. 6. 6.]\n",
" ...\n",
" [5. 5. 5.]\n",
" [6. 6. 6.]\n",
" [8. 8. 8.]]\n"
]
}
],
"source": [
"print(X_val_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
" criterion='gini', max_depth=None, max_features='auto',\n",
" max_leaf_nodes=None, max_samples=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=200,\n",
" n_jobs=None, oob_score=True, random_state=None,\n",
" verbose=0, warm_start=False)"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Train a classifier which will take as input our predictions matrix\n",
"blender = RandomForestClassifier(n_estimators=200, oob_score=True)\n",
"blender.fit(X_val_predictions, y_val)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9727"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check our out of bag score to get an idea of accuracy\n",
"blender.oob_score_"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Round up predictions for X_test\n",
"X_test_predictions = np.empty((len(X_val), len(classifiers)), dtype=np.float32)\n",
"\n",
"for index, clf in enumerate(classifiers):\n",
" X_test_predictions[:, index] = clf.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"# Use our blender to predict based on our predictions matrix\n",
"y_pred = blender.predict(X_test_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.968"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_score(y_pred, y_test)"
]
},
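{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Note (not in the original solution):* the blender above is a hand-rolled stacking ensemble trained on a held-out set. As a hedged alternative, scikit-learn's `StackingClassifier` (available since 0.22) automates the same idea using out-of-fold predictions. The estimator settings and `cv` value below are illustrative assumptions, and fitting is slow because each base estimator is trained several times."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: the same stacking idea with sklearn's StackingClassifier\n",
"from sklearn.ensemble import StackingClassifier, RandomForestClassifier, ExtraTreesClassifier\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"stack_clf = StackingClassifier(\n",
"    estimators=[('rf', RandomForestClassifier()),\n",
"                ('et', ExtraTreesClassifier()),\n",
"                ('sv', LinearSVC(max_iter=10000, tol=1e-3))],\n",
"    final_estimator=RandomForestClassifier(n_estimators=200),\n",
"    cv=3  # out-of-fold predictions become the blender's training features\n",
")\n",
"\n",
"# stack_clf.fit(X_train, y_train)   # slow: trains each base estimator cv+1 times\n",
"# stack_clf.score(X_test, y_test)"
]
},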
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}