From 0a067f597078edee14bbeb41313a9e89c5a8c200 Mon Sep 17 00:00:00 2001
From: ashepley <ashepley@myune.edu.au>
Date: Tue, 4 Aug 2020 13:25:21 +1000
Subject: [PATCH] Upload New File

---
 .../Non-linear_Support_Vector_Machinces.ipynb | 332 ++++++++++++++++++
 1 file changed, 332 insertions(+)
 create mode 100644 topic_28/Non-linear_Support_Vector_Machinces.ipynb

diff --git a/topic_28/Non-linear_Support_Vector_Machinces.ipynb b/topic_28/Non-linear_Support_Vector_Machinces.ipynb
new file mode 100644
index 0000000..e38af6e
--- /dev/null
+++ b/topic_28/Non-linear_Support_Vector_Machinces.ipynb
@@ -0,0 +1,332 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Non-linear Support Vector Machines\n",
+    "\n",
+    "You can obtain the dataset used in this lecture here: https://www.kaggle.com/rakeshrau/social-network-ads/data. It is a 'categorical dataset to determine whether a user purchased a particular product'."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#import\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from matplotlib.colors import *\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import *\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.metrics import *\n",
+    "from sklearn import metrics\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.preprocessing import StandardScaler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here are the functions you used in the previous lectures"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_data(DATASET_PATH):\n",
+    "    return pd.read_csv(DATASET_PATH)\n",
+    "\n",
+    "def check_NaN(dataframe):  # report missing values; prints only, returns None\n",
+    "    null_mask = dataframe.isnull()\n",
+    "    print(\"Total NaN:\", null_mask.values.sum())\n",
+    "    print(\"NaN by column:\\n\", null_mask.sum())\n",
+    "\n",
+    "def one_hot_encode(dataframe, col_name):\n",
+    "    \"\"\"Return a new DataFrame in which col_name is one-hot encoded (dummy columns prefixed with col_name).\"\"\"\n",
+    "    return pd.get_dummies(dataframe, columns=[col_name], prefix=[col_name])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the dataset, and have a look at its contents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATASET_PATH = './datasets/Social_Network_Ads.csv'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#create pandas object\n",
+    "ads = load_data(DATASET_PATH)\n",
+    "ads.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll be training our SVM on the Age and Estimated Salary features, to predict whether the user purchased a product based on an ad or not. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chosen_columns = ['Age', 'EstimatedSalary', 'Purchased']  # two features + the target\n",
+    "subset = ads.filter(items=chosen_columns)\n",
+    "subset.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Always check whether your subset contains NaN values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "check_NaN(subset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Split the dataset into train and test sets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_train, x_test, y_train, y_test = train_test_split(subset.drop(columns=['Purchased']), subset['Purchased'], test_size=0.2, random_state=42)\n",
+    "print(\"x train/test \", x_train.shape, x_test.shape)\n",
+    "print(\"y train/test \", y_train.shape, y_test.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NumPy arrays for the training split (the *_dev names)\n",
+    "x_dev, y_dev = x_train.values, y_train.values\n",
+    "# ... and for the held-out test split (the *_t names)\n",
+    "x_t, y_t = x_test.values, y_test.values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
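+    "Standardising the features (zero mean, unit variance) is expected when using SVMs, because the RBF kernel depends on distances between points and an unscaled feature such as salary would otherwise dominate. Learn more here:\n",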
+    "https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Feature scaling: learn mean/std on the training split only\n",
+    "sc = StandardScaler()\n",
+    "# Fit on train, then apply the SAME transform to test (re-fitting on test leaks its statistics)\n",
+    "x_dev = sc.fit_transform(x_train.values)  # read from x_train so re-running this cell is idempotent\n",
+    "x_t = sc.transform(x_test.values)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### SVM Classifier\n",
+    "Create the SVM, and train it on the standardised data\n",
+    "\n",
+    "### Parameters for SVC: Gamma and C\n",
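+    "Gamma sets how far the influence of a single training example reaches. A low value of gamma gives a smooth decision boundary that fits the training dataset loosely, whereas a high value of gamma fits the training dataset very closely, which can result in over-fitting.\n",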
+    "\n",
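+    "The C parameter controls regularisation. A smaller value of C tolerates more misclassified training points and yields a wider (softer) margin; a larger value of C penalises misclassification more heavily and yields a narrower margin, which can lead to over-fitting."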
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "svm_classifier = SVC(kernel = 'rbf', random_state=0)#gamma=0.001, C=10\n",
+    "svm_classifier.fit(x_dev, y_dev)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Inference"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Pass in the test set, to see how well it performs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions = svm_classifier.predict(x_t)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Confusion matrix shows you how many of each class were correctly and incorrectly classified"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "confusion_matrix(y_t, predictions, labels = [0,1])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Evaluation\n",
+    "Evaluation using mean squared error, accuracy, precision and recall"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#mean squared error\n",
+    "np.mean((predictions - y_t) ** 2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Accuracy: {:.2f}%\".format(metrics.accuracy_score(y_t, predictions) * 100))  # one shared 2-decimal format, no float noise\n",
+    "print(\"Precision: {:.2f}%\".format(metrics.precision_score(y_t, predictions) * 100))\n",
+    "print(\"Recall: {:.2f}%\".format(metrics.recall_score(y_t, predictions) * 100))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Visualisation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Decision regions of the trained SVM with the test points drawn on top\n",
+    "xs, ys = x_t, y_t\n",
+    "class_cmap = ListedColormap(('orange', 'grey'))  # built once, shared by regions and points\n",
+    "# Dense grid over the scaled feature plane, padded by 1 unit on every side\n",
+    "X1, X2 = np.meshgrid(np.arange(start=xs[:, 0].min() - 1, stop=xs[:, 0].max() + 1, step=0.01),\n",
+    "                     np.arange(start=xs[:, 1].min() - 1, stop=xs[:, 1].max() + 1, step=0.01))\n",
+    "grid_pred = svm_classifier.predict(np.column_stack((X1.ravel(), X2.ravel()))).reshape(X1.shape)\n",
+    "plt.contourf(X1, X2, grid_pred, alpha=0.75, cmap=class_cmap)\n",
+    "plt.xlim(X1.min(), X1.max())\n",
+    "plt.ylim(X2.min(), X2.max())\n",
+    "\n",
+    "for i, j in enumerate(np.unique(ys)):\n",
+    "    # color= applies one RGBA tuple to every point; c= can mistake it for per-point values\n",
+    "    plt.scatter(xs[ys == j, 0], xs[ys == j, 1], color=class_cmap(i), label=j)\n",
+    "\n",
+    "plt.title('Test Set')\n",
+    "plt.xlabel('Age (standardised)')\n",
+    "plt.ylabel('Estimated Salary (standardised)')\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Exercise\n",
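+    "Add the feature 'Gender' to the training set, and see if the accuracy improves and the mean squared error drops! 'Gender' is categorical, so convert it to numeric columns first, e.g. with the `one_hot_encode` function defined above."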
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-- 
GitLab