522 lines
14 KiB
Plaintext
522 lines
14 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"from matplotlib import pyplot as plt\n",
|
|
"import os\n",
|
|
"from sklearn.datasets import fetch_openml\n",
|
|
"from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\n",
|
|
"from sklearn.svm import LinearSVC\n",
|
|
"from sklearn.metrics import accuracy_score\n",
|
|
"from sklearn.preprocessing import normalize"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**Exercise 8**\n",
|
|
"\n",
|
|
"Create hard and soft voting ensembles on MNIST."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"mnist = fetch_openml('mnist_784', version=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"mnist.keys()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(70000, 784)"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"X, y = mnist['data'], mnist['target']\n",
|
|
"X.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Split into train, val, test sets of size 50k, 10k, 10k\n",
|
|
"\n",
|
|
"X_train = X[:50000]\n",
|
|
"y_train = y[:50000]\n",
|
|
"X_val = X[50000:60000]\n",
|
|
"y_val = y[50000:60000]\n",
|
|
"X_test = X[60000:]\n",
|
|
"y_test = y[60000:]\n",
|
|
"\n",
|
|
"# Normalize features: scale pixel intensities into [0, 1].\n",
|
|
"# Rebind to new objects instead of using `/=`: the splits are slices of X, so\n",
|
|
"# in-place division may also modify X and rescale the data again if this cell is re-run.\n",
|
|
"\n",
|
|
"X_train = X_train / 255.0\n",
|
|
"X_val = X_val / 255.0\n",
|
|
"X_test = X_test / 255.0"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(X_test.max())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Training our RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
|
|
" criterion='gini', max_depth=None, max_features='auto',\n",
|
|
" max_leaf_nodes=None, max_samples=None,\n",
|
|
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
|
|
" min_samples_leaf=1, min_samples_split=2,\n",
|
|
" min_weight_fraction_leaf=0.0, n_estimators=100,\n",
|
|
" n_jobs=None, oob_score=False, random_state=None,\n",
|
|
" verbose=0, warm_start=False)\n",
|
|
"Training our ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,\n",
|
|
" criterion='gini', max_depth=None, max_features='auto',\n",
|
|
" max_leaf_nodes=None, max_samples=None,\n",
|
|
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
|
|
" min_samples_leaf=1, min_samples_split=2,\n",
|
|
" min_weight_fraction_leaf=0.0, n_estimators=100,\n",
|
|
" n_jobs=None, oob_score=False, random_state=None, verbose=0,\n",
|
|
" warm_start=False)\n",
|
|
"Training our LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
|
|
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
|
|
" multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
|
|
" verbose=0)\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
|
|
" \"the number of iterations.\", ConvergenceWarning)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"rfc = RandomForestClassifier()\n",
|
|
"etc = ExtraTreesClassifier()\n",
|
|
"svc = LinearSVC()\n",
|
|
"\n",
|
|
"classifiers = [rfc, etc, svc]\n",
|
|
"scores = []\n",
|
|
"\n",
|
|
"# Fit each classifier to the training set and predict on X_val\n",
|
|
"for clf in classifiers:\n",
|
|
" print('Training our ', clf)\n",
|
|
" clf.fit(X_train, y_train)\n",
|
|
" score = clf.score(X_val, y_val)\n",
|
|
" scores.append(score)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[0.9719, 0.9741, 0.9208]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(scores)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"c:\\users\\tsb\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
|
|
" \"the number of iterations.\", ConvergenceWarning)\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"VotingClassifier(estimators=[('rf',\n",
|
|
" RandomForestClassifier(bootstrap=True,\n",
|
|
" ccp_alpha=0.0,\n",
|
|
" class_weight=None,\n",
|
|
" criterion='gini',\n",
|
|
" max_depth=None,\n",
|
|
" max_features='auto',\n",
|
|
" max_leaf_nodes=None,\n",
|
|
" max_samples=None,\n",
|
|
" min_impurity_decrease=0.0,\n",
|
|
" min_impurity_split=None,\n",
|
|
" min_samples_leaf=1,\n",
|
|
" min_samples_split=2,\n",
|
|
" min_weight_fraction_leaf=0.0,\n",
|
|
" n_estimators=100,\n",
|
|
" n_jobs=None,\n",
|
|
" oob_score...\n",
|
|
" n_estimators=100,\n",
|
|
" n_jobs=None, oob_score=False,\n",
|
|
" random_state=None, verbose=0,\n",
|
|
" warm_start=False)),\n",
|
|
" ('sv',\n",
|
|
" LinearSVC(C=1.0, class_weight=None, dual=True,\n",
|
|
" fit_intercept=True, intercept_scaling=1,\n",
|
|
" loss='squared_hinge', max_iter=1000,\n",
|
|
" multi_class='ovr', penalty='l2',\n",
|
|
" random_state=None, tol=0.0001,\n",
|
|
" verbose=0))],\n",
|
|
" flatten_transform=True, n_jobs=None, voting='hard',\n",
|
|
" weights=None)"
|
|
]
|
|
},
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.ensemble import VotingClassifier\n",
|
|
"\n",
|
|
"# Hard vote ensemble\n",
|
|
"voting_clf = VotingClassifier(\n",
|
|
" estimators=[('rf', rfc), ('et', etc), ('sv', svc)],\n",
|
|
" voting='hard'\n",
|
|
")\n",
|
|
"\n",
|
|
"voting_clf.fit(X_train, y_train)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.9719"
|
|
]
|
|
},
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"voting_clf.score(X_val, y_val)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Try without SVC\n",
|
|
"\n",
|
|
"del voting_clf.estimators_[2]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.9732"
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"voting_clf.score(X_val, y_val)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.9752"
|
|
]
|
|
},
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Set to soft voting and check if better\n",
|
|
"\n",
|
|
"voting_clf.voting='soft'\n",
|
|
"\n",
|
|
"voting_clf.score(X_val, y_val)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.9707"
|
|
]
|
|
},
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Check on Test Set\n",
|
|
"\n",
|
|
"voting_clf.score(X_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**Exercise 9**\n",
|
|
"\n",
|
|
"Train a stacking ensemble on our previous classifiers."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Collect each classifier's predictions on X_val (one column per classifier)\n",
|
|
"\n",
|
|
"X_val_predictions = np.empty((len(X_val), len(classifiers)), dtype=np.float32)\n",
|
|
"\n",
|
|
"for index, clf in enumerate(classifiers):\n",
|
|
" X_val_predictions[:, index] = clf.predict(X_val)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[[3. 3. 3.]\n",
|
|
" [8. 8. 8.]\n",
|
|
" [6. 6. 6.]\n",
|
|
" ...\n",
|
|
" [5. 5. 5.]\n",
|
|
" [6. 6. 6.]\n",
|
|
" [8. 8. 8.]]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(X_val_predictions)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
|
|
" criterion='gini', max_depth=None, max_features='auto',\n",
|
|
" max_leaf_nodes=None, max_samples=None,\n",
|
|
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
|
|
" min_samples_leaf=1, min_samples_split=2,\n",
|
|
" min_weight_fraction_leaf=0.0, n_estimators=200,\n",
|
|
" n_jobs=None, oob_score=True, random_state=None,\n",
|
|
" verbose=0, warm_start=False)"
|
|
]
|
|
},
|
|
"execution_count": 32,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Train a classifier which will take as input our predictions matrix\n",
|
|
"blender = RandomForestClassifier(n_estimators=200, oob_score=True)\n",
|
|
"blender.fit(X_val_predictions, y_val)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 33,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.9727"
|
|
]
|
|
},
|
|
"execution_count": 33,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Check our out of bag score to get an idea of accuracy\n",
|
|
"blender.oob_score_"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Collect each classifier's predictions on X_test (one column per classifier).\n",
|
|
"# The matrix must have one row per test sample, so size it from X_test.\n",
|
|
"X_test_predictions = np.empty((len(X_test), len(classifiers)), dtype=np.float32)\n",
|
|
"\n",
|
|
"for index, clf in enumerate(classifiers):\n",
|
|
" X_test_predictions[:, index] = clf.predict(X_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Use our blender to predict based on our predictions matrix\n",
|
|
"y_pred = blender.predict(X_test_predictions)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.968"
|
|
]
|
|
},
|
|
"execution_count": 38,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracy_score takes the true labels first, then the predictions\n",
|
|
"accuracy_score(y_test, y_pred)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|