# %% [markdown]
# ## K-Nearest Neighbors (kNN) Classifier
#
# * Tell me who your neighbours are, and I'll tell you who you are!
# * K-Nearest Neighbors (kNN) is an algorithm that classifies a data point based on this principle. This means it classifies a given data point according to the classification labels of surrounding data points
# * It uses Euclidean distance between the data point and other data points to determine who are its neighbours
# * The k represents the number of neighbours to include in the classification problem
# * kNN is well suited for classification tasks where the relationships between the features are complex and hard to understand.

# %%
import numpy as np
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# %%
# Toy "blue" class: x and y coordinates of its six sample points.
xBlue = np.array([0.3, 0.5, 1, 1.4, 1.7, 2])
yBlue = np.array([1, 4.5, 2.3, 1.9, 8.9, 4.1])

# %%
# Toy "red" class: x and y coordinates of its six sample points.
xRed = np.array([3.3, 3.5, 4, 4.4, 5.7, 6])
yRed = np.array([7, 1.5, 6.3, 1.9, 2.9, 7.1])

# %%
# Training matrix with one (x, y) row per point: the blue points first,
# followed by the red points, in the same order as the arrays above.
point_xs = np.concatenate((xBlue, xRed))
point_ys = np.concatenate((yBlue, yRed))
X = np.column_stack((point_xs, point_ys))

# One label per row of X, in the same blue-then-red order.
y = np.repeat([0, 1], [xBlue.size, xRed.size])  # 0: blue class, 1: red class
# %%
# Draw both training classes and, in green, the query point (2, 8) that the
# next cell classifies. The format string only selects the circle marker, so
# the colour is defined once, by the `color` keyword.
plt.plot(xBlue, yBlue, 'o', color='blue')
plt.plot(xRed, yRed, 'o', color='red')
plt.plot(2, 8, 'o', color='green', markersize=10)
plt.axis([-0.5, 10, -0.5, 10])
plt.show()

# %%
classifier = KNeighborsClassifier(n_neighbors=3)  # this is the k value
classifier.fit(X, y)

# predict() returns one label per input row; a single point is passed here,
# so its label is element 0 of the result.
pred = classifier.predict(np.array([[2, 8]]))

if pred[0] == 0:
    print("Data point is blue")
else:
    print("Data point is red")

# %% [markdown]
# ### Let's apply this to 'real' data!
# You can obtain the dataset used in this lecture here: https://www.kaggle.com/rakeshrau/social-network-ads/data. It is a 'categorical dataset to determine whether a user purchased a particular product'.

# %%
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score


def check_NaN(dataframe):
    """Print how many missing values ``dataframe`` contains.

    Two lines are printed: the total number of null cells, then the number
    of null cells in each column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    None
    """
    null_mask = dataframe.isnull()
    print("Total NaN:", null_mask.values.sum())
    print("NaN by column:\n", null_mask.sum())

# %%
# Build the path from its parts so the separator is right on every platform
# and no backslash is read as a string escape. The file is looked up in a
# `datasets` folder relative to the working directory.
data = pd.read_csv(Path("datasets") / "Social_Network_Ads.csv")

# %%
data.head()
# %%
check_NaN(data)

# %% [markdown]
# Select the inputs and the label: `features` holds your x values and `target` holds your y values. They are a pandas DataFrame and a pandas Series respectively; scikit-learn accepts both directly, so there is no need to convert them as we did before!

# %%
# Plain variables are used instead of new attributes on `data`: an attribute
# set on a DataFrame is not a column and is easy to confuse with one.
features = data[["EstimatedSalary", "Age"]]
target = data["Purchased"]

# %% [markdown]
# #### kNN is a Lazy Learner!
# * Although we define train/test splits, there is no explicit training
# * Lazy Learning means there is no explicit training, and no generalisation based on the training data
# * kNN keeps all the training data for inference
# * Making the predictions on the test data is rather slow (because the distance between a test data point and all the training data points must be calculated)
# * kNN uses non-parametric learning: no parameters are to be learned about the data because it makes no assumptions about data distribution

# %%
# Split before any preprocessing so the test rows stay unseen until evaluation.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# %% [markdown]
# Here is the normalisation step. This is very important as we don't want huge or tiny values to obscure meaning of the data. The scaler is fitted on the training rows only and then applied to both splits, so nothing about the test rows leaks into the model.

# %%
scaler = preprocessing.MinMaxScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# %% [markdown]
# Here, we are finding the optimal k value by using cross_val_score. You can find more info on this function here: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

# %%
# Candidate neighbour counts.
# NOTE(review): every cross-validation training fold must hold at least
# max(K_VALUES) rows for the largest k to be usable - confirm against the
# size of the dataset.
K_VALUES = range(1, 100)

k_scores = []
for k in K_VALUES:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Cross-validate on the training split only, so the held-out test rows
    # play no part in choosing k.
    scores = cross_val_score(knn, x_train, y_train, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

# np.argmax returns a zero-based position in k_scores, while the candidates
# start at k = 1; map the position back to the k it was computed for.
optimal_k = K_VALUES[int(np.argmax(k_scores))]
print("Optimal k with cross-validation: ", optimal_k)

# %%
classifier = KNeighborsClassifier(n_neighbors=optimal_k)
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)

# %% [markdown]
# #### Evaluation using mean squared error and accuracy

# %%
print(confusion_matrix(y_test, predictions))

# %%
print("Accuracy:", str(accuracy_score(y_test, predictions)*100)+"%")

# %%
# mean squared error
# NOTE(review): assuming `Purchased` is coded 0/1, each squared difference is
# 0 (correct) or 1 (wrong), so this is the share of misclassified test rows -
# confirm the label coding.
print("Mean squared error: ", str(np.mean((predictions - np.asarray(y_test)) ** 2)*100)+"%")

# %% [markdown]
# ### Visualisation

# %%
from matplotlib.colors import ListedColormap

# Show the held-out test points on top of the classifier's decision regions.
# Both axes are in the min-max scaled units the classifier was fitted on.
xs, ys = x_test, np.asarray(y_test)
class_colours = ListedColormap(('orange', 'grey'))

# Grid of candidate points spanning the test data plus a margin of 1 on every
# side, spaced 0.01 apart.
X1, X2 = np.meshgrid(
    np.arange(start=xs[:, 0].min() - 1, stop=xs[:, 0].max() + 1, step=0.01),
    np.arange(start=xs[:, 1].min() - 1, stop=xs[:, 1].max() + 1, step=0.01),
)

# Classify every grid point, then fold the flat predictions back into the
# grid's shape so contourf can colour the regions.
grid_points = np.column_stack((X1.ravel(), X2.ravel()))
grid_labels = classifier.predict(grid_points).reshape(X1.shape)
plt.contourf(X1, X2, grid_labels, alpha=0.75, cmap=class_colours)

plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(ys)):
    # `color=` (not `c=`) because a single RGBA tuple is passed for all points.
    plt.scatter(xs[ys == j, 0], xs[ys == j, 1],
                color=class_colours(i), label=j)

# The points drawn are x_test / y_test, so the figure is labelled accordingly.
plt.title('Test Set')
plt.xlabel('Estimated Salary')
plt.ylabel('Age')
plt.legend()
plt.show()

# %% [markdown]
# ### Exercise
# Add the feature 'Gender' to the training set, and see if the model improves?

# %%