diff --git a/Supplement-4_2.ipynb b/Supplement-4_2.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..23cad5d53834d486343f0ada9d032c7787bb380d --- /dev/null +++ b/Supplement-4_2.ipynb @@ -0,0 +1,378 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Supplement 4: Classification" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.spatial.distance import cdist\n", + "from scipy.stats import mode\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4.2 Programming Task: K-Nearest Neighbor\n", + "The datasets in files __train-knn.csv__ and __test-knn.csv__ contain samples from a synthetic dataset for training a K-Nearest Neighbor classifier.\n", + "The dataset consists of 7 columns: the first six columns, denoted as x1, x2, ..., x6 represent\n", + " the input features for each data sample, and the last column represents the class label given by 0 or 1.\n", + "There are 200 samples in the __train-knn.csv__ and 100 samples in the __test-knn.csv__.\n", + "\n", + "i\\. 
# Implement the K-Nearest Neighbor classification algorithm using NumPy and SciPy.

# %%
train_set = pd.read_csv('train-knn.csv').to_numpy()
test_set = pd.read_csv('test-knn.csv').to_numpy()


def knn_predict(train_X, train_y, test_X, n):
    """Classify each row of ``test_X`` by majority vote of its nearest training rows.

    Distances are computed with ``scipy.spatial.distance.cdist`` using its
    default metric, and the vote is taken with ``scipy.stats.mode``.

    Parameters
    ----------
    train_X : array-like, shape (n_train, n_features)
        Training feature matrix.
    train_y : array-like, shape (n_train,)
        Training labels, aligned with the rows of ``train_X``.
    test_X : array-like, shape (n_test, n_features)
        Samples to classify.
    n : int
        Number of neighbours that vote. If ``n`` exceeds ``n_train`` every
        training sample votes (the slice below simply returns all rows).

    Returns
    -------
    numpy.ndarray, shape (n_test,)
        Predicted label for every row of ``test_X``.

    Raises
    ------
    ValueError
        If the training set is empty, if ``train_X`` and ``train_y`` disagree
        on the number of samples, or if ``n`` is smaller than 1.
    """
    train_X = np.asarray(train_X)
    train_y = np.asarray(train_y)
    test_X = np.asarray(test_X)

    if train_X.shape[0] == 0:
        raise ValueError("knn_predict needs at least one training sample")
    if train_X.shape[0] != train_y.shape[0]:
        raise ValueError("train_X and train_y must contain the same number of samples")
    if n < 1:
        # n == 0 would select no neighbour and a negative n would silently
        # drop rows from the *end* of the sorted distance matrix.
        raise ValueError("n (number of neighbours) must be at least 1")

    # distance_matrix[i, j] = distance between training row i and test row j
    distance_matrix = cdist(train_X, test_X)
    # Sorting each column puts the closest training rows first; keep the top n
    neighbour_indices = distance_matrix.argsort(axis=0)[:n]
    # Labels of those neighbours, one column per test sample
    neighbour_class = train_y[neighbour_indices]
    # Most frequent label in every column
    frequent_class, _ = mode(neighbour_class, axis=0)

    # Depending on the SciPy version mode() may or may not keep the reduced
    # axis, so flatten explicitly to always return shape (n_test,).
    return np.asarray(frequent_class).reshape(-1)


def get_accuracy(predict_labels, true_labels):
    """Return the fraction of entries of ``predict_labels`` equal to ``true_labels``.

    Parameters
    ----------
    predict_labels : array-like, shape (n_samples,)
        Labels produced by the classifier.
    true_labels : array-like, shape (n_samples,)
        Ground-truth labels.

    Returns
    -------
    float
        Number of matching labels divided by ``n_samples``.

    Raises
    ------
    ValueError
        If the two label arrays do not have the same shape.
    """
    predict_labels = np.asarray(predict_labels)
    true_labels = np.asarray(true_labels)

    if predict_labels.shape != true_labels.shape:
        # Without this check NumPy would broadcast (or compare as a whole)
        # and report a meaningless accuracy.
        raise ValueError("predict_labels and true_labels must have the same shape")

    correct_count = np.sum(predict_labels == true_labels)
    total_samples = predict_labels.shape[0]

    return correct_count / total_samples


# last column is the class label
train_X = train_set[:, :-1]
train_y = train_set[:, -1]

test_X = test_set[:, :-1]
test_y = test_set[:, -1]

# Smoke test on a small, non-empty subset of the data
# (the previous slice [:0] selected zero training samples).
predicted_y = knn_predict(train_X[:10], train_y[:10], test_X[:1], n=3)
# Perform cross-validation (with 5 folds) on the train dataset __train-knn.csv__ to determine a suitable value of K.

# %%
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsClassifier

# Fixed seed so the fold assignment, and therefore best_k, is identical on
# every Restart & Run All. Results will differ from runs made with the
# previous unseeded shuffle.
SEED = 0
n_fold = 5
k_max = 50

train_size = train_set.shape[0]
rng = np.random.default_rng(SEED)
indices = rng.permutation(train_size)

# split dataset into n_fold (almost) equally sized index groups
splits = np.array_split(indices, n_fold)

# The validation masks do not depend on k, so build them once:
# True marks a validation sample, False a training sample of that fold.
val_masks = []
for val_indices in splits:
    mask = np.zeros(train_size, dtype=bool)
    mask[val_indices] = True
    val_masks.append(mask)

# One [k, mean train accuracy, mean validation accuracy] record per k
cv_records = []

for k in range(1, k_max + 1):

    val_acc = np.zeros(n_fold)
    train_acc = np.zeros(n_fold)

    for fold, mask in enumerate(val_masks):
        # Get val data using mask
        cv_val_X = train_X[mask]
        cv_val_y = train_y[mask]

        # Get train data by inverting the mask
        cv_train_X = train_X[~mask]
        cv_train_y = train_y[~mask]

        # Train accuracy (each sample is among its own neighbours here)
        predicted_labels_train = knn_predict(cv_train_X, cv_train_y, cv_train_X, k)
        train_acc[fold] = get_accuracy(predicted_labels_train, cv_train_y)

        # Validation accuracy
        predicted_labels_val = knn_predict(cv_train_X, cv_train_y, cv_val_X, k)
        val_acc[fold] = get_accuracy(predicted_labels_val, cv_val_y)

    cv_records.append([k, train_acc.mean(), val_acc.mean()])

cv_results = pd.DataFrame(cv_records, columns=['k', 'train_acc', 'val_acc'])

plt.plot(cv_results['k'], cv_results['train_acc'], 'r', label='train')
plt.plot(cv_results['k'], cv_results['val_acc'], 'b', label='validation')
plt.grid()
plt.legend()
plt.ylabel('Accuracy')
plt.xlabel('Value of K in K-Nearest Neighbour')
# k_max + 1 so that the last evaluated k also gets a tick
plt.xticks(np.arange(0, k_max + 1, step=2))
plt.show()

# Best validation accuracy first; ties go to the smaller k so that the
# selected value does not depend on the sort algorithm's ordering of ties.
exp = cv_results.sort_values(by=['val_acc', 'k'], ascending=[False, True])

best_k = int(exp.iloc[0]['k'])
print('best k from cross validation: ' , best_k)

exp.head()

# iii. Using the optimal value of k from the cross-validation, obtain the accuracy of your model on the test dataset __test-knn.csv__.

# %%
predicted_labels = knn_predict(train_X, train_y, test_X, best_k)
acc = get_accuracy(predicted_labels, test_y)
print(acc)

# iv. Compare your result with the KNeighborsClassifier model from the scikit-learn library.

# %%
model = KNeighborsClassifier(n_neighbors=best_k)
model.fit(train_X, train_y)

print(model.score(test_X, test_y))

knn2 = KNeighborsClassifier()
# Search the same range 1..k_max as the manual cross-validation
# (np.arange(1, 50) stopped at 49).
param_grid = {'n_neighbors': np.arange(1, k_max + 1)}
# Use gridsearch to test all values for n_neighbors; seeded for reproducibility
knn_gscv = GridSearchCV(
    knn2,
    param_grid,
    cv=KFold(n_fold, shuffle=True, random_state=SEED),
)
# fit model to data
knn_gscv.fit(train_X, train_y)

print(knn_gscv.best_params_['n_neighbors'])

# v. How do the bias and variance of each model vary as K increases?
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernel_info": { + "name": "python3" + }, + "kernelspec": { + "display_name": "Python 3.9.7 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "nteract": { + "version": "0.15.0" + }, + "vscode": { + "interpreter": { + "hash": "ecd4d27f568e4e411014651395f8ce8e2a5bffeb2f41f9e1cf81fec543185c37" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}