From 0a067f597078edee14bbeb41313a9e89c5a8c200 Mon Sep 17 00:00:00 2001
From: ashepley <ashepley@myune.edu.au>
Date: Tue, 4 Aug 2020 13:25:21 +1000
Subject: [PATCH] Upload New File

---
 .../Non-linear_Support_Vector_Machinces.ipynb | 332 ++++++++++++++++++
 1 file changed, 332 insertions(+)
 create mode 100644 topic_28/Non-linear_Support_Vector_Machinces.ipynb

diff --git a/topic_28/Non-linear_Support_Vector_Machinces.ipynb b/topic_28/Non-linear_Support_Vector_Machinces.ipynb
new file mode 100644
index 0000000..e38af6e
--- /dev/null
+++ b/topic_28/Non-linear_Support_Vector_Machinces.ipynb
@@ -0,0 +1,332 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Non-linear Support Vector Machines\n",
+    "\n",
+    "You can obtain the dataset used in this lecture here: https://www.kaggle.com/rakeshrau/social-network-ads/data. It is a 'categorical dataset to determine whether a user purchased a particular product'."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports used throughout this notebook\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from matplotlib.colors import ListedColormap\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "from sklearn import metrics\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.preprocessing import StandardScaler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here are the helper functions you used in the previous lectures."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_data(DATASET_PATH):\n",
+    "    return pd.read_csv(DATASET_PATH)\n",
+    "\n",
+    "def check_NaN(dataframe):\n",
+    "    print(\"Total NaN:\", dataframe.isnull().values.sum())\n",
+    "    print(\"NaN by column:\\n\", dataframe.isnull().sum())\n",
+    "    return\n",
+    "\n",
+    "def one_hot_encode(dataframe, col_name):\n",
+    "    dataframe = pd.get_dummies(dataframe, columns=[col_name], prefix=[col_name])\n",
+    "    return dataframe"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the dataset and have a look at its contents."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATASET_PATH = './datasets/Social_Network_Ads.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create a pandas DataFrame from the CSV file\n",
+    "ads = load_data(DATASET_PATH)\n",
+    "ads.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll be training our SVM on the Age and Estimated Salary features to predict whether or not the user purchased the product advertised to them."
+   ]
+  },
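+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Gender isn't used in this lecture, but it is the extra feature the exercise at the end asks you to add. As a quick, purely illustrative example of the one_hot_encode helper above, here is how that categorical column could be converted into numeric columns (this cell is optional and the encoded frame isn't used below)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional illustration of the one_hot_encode helper defined above.\n",
+    "# Gender is categorical, so it must be one-hot encoded before an SVM can use it\n",
+    "# (this is exactly what the exercise at the end asks for).\n",
+    "encoded = one_hot_encode(ads, 'Gender')\n",
+    "encoded.head()"
+   ]
+  },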
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chosen_columns = ['Age','EstimatedSalary','Purchased']\n", + "subset = ads.filter(chosen_columns)\n", + "subset.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Always check whether your subset contains NaN values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "check_NaN(subset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split the dataset into train and test sets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_train, x_test, y_train, y_test = train_test_split(subset.drop(['Purchased'], axis=1),subset['Purchased'],test_size=0.2,random_state=42) \n", + "print(\"x train/test \",x_train.shape, x_test.shape)\n", + "print(\"y train/test \",y_train.shape, y_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_dev = x_train.values\n", + "y_dev = y_train.values\n", + "x_t = x_test.values\n", + "y_t = y_test.values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Normalisation of data is expected when using SVMs. Learn more here:\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#feature scaling\n", + "sc = StandardScaler()\n", + "\n", + "x_dev = sc.fit_transform(x_dev)\n", + "x_t = sc.fit_transform(x_t)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### SVM Classifier\n", + "Create the SVM, and train it on the standardised data\n", + "\n", + "### Parameters for SVC: Gamma and C\n", + "A lower value of Gamma will loosely fit the training dataset, whereas a higher value of gamma will exactly fit the training dataset resulting in over-fitting.\n", + "\n", + "C parameter used is to maintain regularization. A smaller value of C creates a small-margin hyperplane and a larger value of C creates a larger-margin hyperplane." 
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# RBF (non-linear) kernel; gamma and C are left at their defaults here\n",
+    "svm_classifier = SVC(kernel='rbf', random_state=0)  # e.g. try gamma=0.001, C=10\n",
+    "svm_classifier.fit(x_dev, y_dev)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Pass in the test set to see how well the classifier performs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions = svm_classifier.predict(x_t)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The confusion matrix shows how many examples of each class were correctly and incorrectly classified."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "confusion_matrix(y_t, predictions, labels=[0, 1])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Evaluation\n",
+    "Evaluate the model using mean squared error, accuracy, precision and recall."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mean squared error; on 0/1 labels this is the proportion of misclassified test examples\n",
+    "np.mean((predictions - y_t) ** 2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Accuracy:\", str(metrics.accuracy_score(y_t, predictions)*100)+\"%\")\n",
+    "print(\"Precision:\", str(round(metrics.precision_score(y_t, predictions)*100))+\"%\")\n",
+    "print(\"Recall:\", str(round(metrics.recall_score(y_t, predictions)*100))+\"%\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Visualisation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xs, ys = x_t, y_t\n",
+    "\n",
+    "# evaluate the classifier over a fine grid covering the (scaled) Age/Salary plane\n",
+    "X1, X2 = np.meshgrid(np.arange(start=xs[:, 0].min() - 1, stop=xs[:, 0].max() + 1, step=0.01),\n",
+    "                     np.arange(start=xs[:, 1].min() - 1, stop=xs[:, 1].max() + 1, step=0.01))\n",
+    "\n",
+    "# shade the predicted decision regions\n",
+    "plt.contourf(X1, X2, svm_classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),\n",
+    "             alpha=0.75, cmap=ListedColormap(('orange', 'grey')))\n",
+    "\n",
+    "plt.xlim(X1.min(), X1.max())\n",
+    "plt.ylim(X2.min(), X2.max())\n",
+    "\n",
+    "# overlay the actual test points, coloured by their true class\n",
+    "for i, j in enumerate(np.unique(ys)):\n",
+    "    plt.scatter(xs[ys == j, 0], xs[ys == j, 1],\n",
+    "                c=ListedColormap(('orange', 'grey'))(i), label=j)\n",
+    "\n",
+    "plt.title('Test Set')\n",
+    "plt.xlabel('Age')\n",
+    "plt.ylabel('Estimated Salary')\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Exercise\n",
+    "Add the feature 'Gender' to the training set, and see whether the accuracy improves and the mean squared error drops! Remember that Gender is categorical, so it needs to be one-hot encoded first (see the one_hot_encode helper and the illustrative cell near the start of this notebook)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--
GitLab