# %% [markdown]
# ## K-Nearest Neighbors (kNN) Classifier
#
# * Tell me who your neighbours are, and I'll tell you who you are!
# * K-Nearest Neighbors (kNN) is an algorithm that classifies a data point
#   based on this principle: a given data point is classified according to the
#   class labels of the data points that surround it.
# * It uses the Euclidean distance between the data point and the other data
#   points to determine which of them are its neighbours.
# * The k is the number of neighbours that take part in the classification.
# * kNN is well suited to classification tasks where the relationships between
#   the features are complex and hard to understand.

# %%
import numpy as np
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

# %%
# Toy training set: six hand-picked (x, y) points per class. Every array below
# is derived from these two lists so the coordinates are written down once.
blue_points = [(0.3, 1), (0.5, 4.5), (1, 2.3), (1.4, 1.9), (1.7, 8.9), (2, 4.1)]
red_points = [(3.3, 7), (3.5, 1.5), (4, 6.3), (4.4, 1.9), (5.7, 2.9), (6, 7.1)]

# Per-class coordinate vectors, used for plotting.
xBlue, yBlue = (np.array(axis_values) for axis_values in zip(*blue_points))

# %%
xRed, yRed = (np.array(axis_values) for axis_values in zip(*red_points))

# %%
# Feature matrix (one row per point, blue rows first) and matching labels.
X = np.array(blue_points + red_points)
y = np.array([0] * len(blue_points) + [1] * len(red_points))  # 0: blue class, 1: red class
# %%
# 'o' is a marker-only format string. The original 'ro' also encoded a colour
# (red) that conflicted with the explicit color= keyword on every call.
plt.plot(xBlue, yBlue, 'o', color='blue')
plt.plot(xRed, yRed, 'o', color='red')
plt.plot(2, 8, 'o', color='green', markersize=10)  # the point we want to classify
plt.axis([-0.5, 10, -0.5, 10])

# %%
classifier = KNeighborsClassifier(n_neighbors=3)  # this is the k value
classifier.fit(X, y)

pred = classifier.predict(np.array([[2, 8]]))

# predict() returns one label per query row; we passed a single row, so look
# at that one label instead of truth-testing the whole array.
if pred[0] == 0:
    print("Data point is blue")
else:
    print("Data point is red")

# %% [markdown]
# ### Let's apply this to 'real' data!
# You can obtain the dataset used in this lecture here:
# https://www.kaggle.com/rakeshrau/social-network-ads/data.
# It is a 'categorical dataset to determine whether a user purchased a
# particular product'.

# %%
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score


def check_NaN(dataframe):
    """Print how many values are missing in ``dataframe``.

    Prints the total number of null entries, followed by the number of null
    entries in each column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to inspect. It is not modified.

    Returns
    -------
    None
    """
    missing = dataframe.isnull()
    print("Total NaN:", missing.values.sum())
    print("NaN by column:\n", missing.sum())


# %%
# Build the path with pathlib so it works on every OS; the original string
# used Windows separators whose backslashes are invalid escape sequences.
data = pd.read_csv(Path("datasets") / "Social_Network_Ads.csv")

# %%
data.head()
# %%
check_NaN(data)

# %% [markdown]
# Select the columns we want: `features_raw` holds the x values and `target`
# holds the y values. At this point they are still a pandas DataFrame and a
# pandas Series; the features only become NumPy arrays once the scaler has
# transformed them below.

# %%
features_raw = data[["EstimatedSalary", "Age"]]
target = data["Purchased"]

# %% [markdown]
# #### kNN is a Lazy Learner!
# * Although we define train/test splits, there is no explicit training
# * Lazy Learning means there is no explicit training, and no generalisation
#   based on the training data
# * kNN keeps all the training data for inference
# * Making the predictions on the test data is rather slow (because the
#   distance between a test data point and all the training data points must
#   be calculated)
# * kNN uses non-parametric learning: no parameters are to be learned about
#   the data because it makes no assumptions about the data distribution

# %%
# Split BEFORE scaling so the scaler never sees the held-out rows.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    features_raw, target, test_size=0.2, random_state=42
)

# %% [markdown]
# Here is the normalisation step. This is very important for kNN because it is
# distance based: we don't want huge or tiny values to obscure the meaning of
# the data. The scaler learns its min/max from the training rows only and is
# then applied unchanged to the test rows, so no information about the test
# set leaks into preprocessing.

# %%
scaler = preprocessing.MinMaxScaler().fit(x_train_raw)
x_train = scaler.transform(x_train_raw)  # NumPy array
x_test = scaler.transform(x_test_raw)    # NumPy array; values may fall slightly outside [0, 1]

# Kept so that any later cell that still reads `data.features` / `data.target`
# keeps working. NOTE(review): pandas warns about attribute assignment on a
# DataFrame and the attributes are lost when the frame is copied; prefer the
# plain variables above in new code.
data.features = scaler.transform(features_raw)
data.target = target

# %% [markdown]
# Here, we are finding the optimal k value by using cross_val_score.
# %% [markdown]
# More information on `cross_val_score`:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html
#
# The search below is cross-validated on the training split only, so the
# held-out test rows play no part in choosing k.

# %%
k_values = list(range(1, 100))
k_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    # x_train / y_train come from the train/test split cell.
    scores = cross_val_score(knn, x_train, y_train, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

# np.argmax returns a zero-based position in k_scores, whereas the candidate
# k values start at 1, so map the position back to the actual k.
optimal_k = k_values[int(np.argmax(k_scores))]
print("Optimal k with cross-validation: ", optimal_k)

# %%
classifier = KNeighborsClassifier(n_neighbors=optimal_k)
classifier.fit(x_train, y_train)
predictions = classifier.predict(x_test)

# %% [markdown]
# #### Evaluation using mean squared error and accuracy

# %%
print(confusion_matrix(y_test, predictions))

# %%
print("Accuracy:",str(accuracy_score(y_test, predictions)*100)+"%")

# %%
#mean squared error
# NOTE(review): assuming the labels are coded 0/1, each squared difference is
# 1 for a wrong prediction and 0 otherwise, so this is the misclassification
# rate (100% minus the accuracy above) - confirm against the dataset.
print("Mean squared error: ",str(np.mean((predictions - y_test) ** 2)*100)+"%")

# %% [markdown]
# ### Visualisation

# %%
from matplotlib.colors import ListedColormap

# Plot the held-out test points on top of the classifier's decision regions.
# np.asarray makes the boolean masks below plain positional arrays.
xs, ys = x_test, np.asarray(y_test)
class_cmap = ListedColormap(('orange', 'grey'))

# Dense grid covering the test points with a margin of 1 on every side.
X1, X2 = np.meshgrid(np.arange(start=xs[:, 0].min() - 1, stop=xs[:, 0].max() + 1, step=0.01),
                     np.arange(start=xs[:, 1].min() - 1, stop=xs[:, 1].max() + 1, step=0.01))

# Classify every grid node and colour the plane by predicted class.
grid_points = np.array([X1.ravel(), X2.ravel()]).T
plt.contourf(X1, X2, classifier.predict(grid_points).reshape(X1.shape),
             alpha=0.75, cmap=class_cmap)

plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(ys)):
    # color= (not c=) because a single RGBA tuple is one colour for the whole
    # group, not a per-point value sequence.
    plt.scatter(xs[ys == j, 0], xs[ys == j, 1],
                color=class_cmap(i), label=j)

plt.title('Test Set')  # the points drawn are x_test / y_test
plt.xlabel('Estimated Salary')
plt.ylabel('Age')
plt.legend()
plt.show()

# %% [markdown]
# ### Exercise
# Add the feature 'Gender' to the training set and see whether the model
# improves.

# %%
# Your solution goes here.