{ "cells": [ { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import os\n", "from keras.utils.np_utils import to_categorical # convert to one-hot-encoding" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Build the CSV locations first, then read each table into a DataFrame.\n", "train_path = os.path.join('data', 'train.csv')\n", "test_path = os.path.join('data', 'test.csv')\n", "train = pd.read_csv(train_path)\n", "test = pd.read_csv(test_path)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelpixel0pixel1pixel2pixel3pixel4pixel5pixel6pixel7pixel8...pixel774pixel775pixel776pixel777pixel778pixel779pixel780pixel781pixel782pixel783
01000000000...0000000000
10000000000...0000000000
21000000000...0000000000
34000000000...0000000000
40000000000...0000000000
\n", "

5 rows × 785 columns

\n", "
" ], "text/plain": [ " label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 \\\n", "0 1 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 0 0 \n", "2 1 0 0 0 0 0 0 0 0 \n", "3 4 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 0 0 \n", "\n", " pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 \\\n", "0 0 ... 0 0 0 0 0 0 \n", "1 0 ... 0 0 0 0 0 0 \n", "2 0 ... 0 0 0 0 0 0 \n", "3 0 ... 0 0 0 0 0 0 \n", "4 0 ... 0 0 0 0 0 0 \n", "\n", " pixel780 pixel781 pixel782 pixel783 \n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "\n", "[5 rows x 785 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pixel0pixel1pixel2pixel3pixel4pixel5pixel6pixel7pixel8pixel9...pixel774pixel775pixel776pixel777pixel778pixel779pixel780pixel781pixel782pixel783
00000000000...0000000000
10000000000...0000000000
20000000000...0000000000
30000000000...0000000000
40000000000...0000000000
\n", "

5 rows × 784 columns

\n", "
" ], "text/plain": [ " pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 \\\n", "0 0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 0 0 \n", "\n", " pixel9 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 \\\n", "0 0 ... 0 0 0 0 0 0 \n", "1 0 ... 0 0 0 0 0 0 \n", "2 0 ... 0 0 0 0 0 0 \n", "3 0 ... 0 0 0 0 0 0 \n", "4 0 ... 0 0 0 0 0 0 \n", "\n", " pixel780 pixel781 pixel782 pixel783 \n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "\n", "[5 rows x 784 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test.head()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "X_train = train.drop('label', axis=1)\n", "y = train['label']\n", "X_test = test" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(42000, 784)\n", "(42000,)\n", "(28000, 784)\n" ] } ], "source": [ "print(X_train.shape)\n", "print(y.shape)\n", "print(X_test.shape)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Feature normalization (max greyscale value is 255).\n", "# Train and test pixels must be scaled identically; otherwise a model fitted\n", "# on [0, 1] inputs would be fed raw [0, 255] values at prediction time.\n", "# NOTE: the saved X_test is therefore already scaled - downstream code must\n", "# not divide it by 255 again.\n", "\n", "X_train = X_train / 255.0\n", "X_test = X_test / 255.0" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Reshape the input matrices to the image shape (28x28) with a single\n", "# channel (it would be 3 for RGB) and one-hot encode the labels vector\n", "\n", "X_train = X_train.values.reshape(-1, 28, 28, 1)\n", "X_test = X_test.values.reshape(-1, 28, 28, 1)\n", "y = to_categorical(y, num_classes=10)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(42000, 28, 28, 1)\n", "(42000, 10)\n", "(28000, 28, 28, 1)\n" ] } ], "source": [ "print(X_train.shape)\n", 
"print(y.shape)\n", "print(X_test.shape)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Persist the preprocessed arrays as binary .npy files.\n", "# Paths are built with os.path.join so the cell works on any OS (the old\n", "# hard-coded backslashes only worked on Windows and relied on invalid string\n", "# escapes). The file names are spelled out in full and equal what np.save\n", "# used to produce, because np.save appends '.npy' when the name lacks it;\n", "# despite the '.csv' in two of the names, these are not CSV files.\n", "np.save(os.path.join('data', 'X_train.csv.npy'), X_train)\n", "np.save(os.path.join('data', 'y.csv.npy'), y)\n", "np.save(os.path.join('data', 'X_test.npy'), X_test)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }