From ad4fdf25cb2369bfa03bc3d0590d5d380317f6bd Mon Sep 17 00:00:00 2001
From: ashepley <ashepley@myune.edu.au>
Date: Tue, 4 Aug 2020 13:21:19 +1000
Subject: [PATCH] Upload New File

---
 ...Linear_Support_Vector_Machines-Final.ipynb | 404 ++++++++++++++++++
 1 file changed, 404 insertions(+)
 create mode 100644 topic_27/Linear_Support_Vector_Machines-Final.ipynb

diff --git a/topic_27/Linear_Support_Vector_Machines-Final.ipynb b/topic_27/Linear_Support_Vector_Machines-Final.ipynb
new file mode 100644
index 0000000..1c95764
--- /dev/null
+++ b/topic_27/Linear_Support_Vector_Machines-Final.ipynb
@@ -0,0 +1,404 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Linear Support Vector Machines\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from matplotlib import pyplot as plt\n",
+    "from sklearn import svm\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.metrics import *\n",
+    "from sklearn import metrics\n",
+    "import matplotlib.pyplot as plt\n",
+    "from matplotlib.colors import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xBlue = np.array([0.3,0.5,1,1.4,1.7,2])\n",
+    "yBlue = np.array([1,4.5,2.3,1.9,8.9,4.1])\n",
+    "xRed = np.array([3.3,3.5,4,4.4,5.7,6])\n",
+    "yRed = np.array([7,1.5,6.3,1.9,2.9,7.1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = np.array([[0.3,1],[0.5,4.5],[1,2.3],[1.4,1.9],[1.7,8.9],[2,4.1],[3.3,7],[3.5,1.5],[4,6.3],[4.4,1.9],[5.7,2.9],[6,7.1]])\n",
+    "y = np.array([0,0,0,0,0,0,1,1,1,1,1,1]) # 0: blue class, 1: red class"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.plot(xBlue, yBlue, 'o', color='blue')  # 'o' = circle markers; colour comes from the keyword only\n",
+    "plt.plot(xRed, yRed, 'o', color='red')\n",
+    "plt.plot(4.5, 4.5, 'o', color='green');  # the point classified further below; ';' hides the Line2D repr"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "classifier = svm.SVC(kernel='linear')  # SVC defaults to the RBF kernel; ask for the linear one explicitly\n",
+    "classifier.fit(X,y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "coord = [4.5,4.5]\n",
+    "blue_red = classifier.predict([coord])\n",
+    "\n",
+    "# y encodes the classes as 0: blue, 1: red; predict returns one label per input row\n",
+    "predicted_label = blue_red[0]\n",
+    "verdict = \" is red\" if predicted_label == 1 else \" is blue\"\n",
+    "print(coord, verdict)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Let's apply this to 'real' data!\n",
+    "\n",
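+    "We'll be using a Support Vector Machine to predict whether a country is happy or not (a Happiness Score of at least 5) based on its government trust and GDP per capita scores. The code below loads `./datasets/2015.csv`, whose columns match the 2015 World Happiness Report; the WHO life expectancy dataset that some variable names and comments refer to is available here: https://www.kaggle.com/augustus0498/life-expectancy-who"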
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.model_selection import *\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "import numpy as np\n",
+    "from sklearn.metrics import mean_squared_error\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "def load_data(DATASET_PATH):  # read the CSV file at DATASET_PATH into a pandas DataFrame\n",
+    "    return pd.read_csv(filepath_or_buffer=DATASET_PATH)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATASET_PATH = './datasets/2015.csv'\n",
+    "dataset = load_data(DATASET_PATH)\n",
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#functions\n",
+    "def check_NaN(dataframe):\n",
+    "    \"\"\"Print the total count of missing values in `dataframe`, then the count for each column.\"\"\"\n",
+    "    print(\"Total NaN:\", dataframe.isna().to_numpy().sum())\n",
+    "    print(\"NaN by column:\\n\",dataframe.isna().sum())\n",
+    "\n",
+    "def fillNaN_median(dataframe, key):\n",
+    "    \"\"\"Replace the missing values in column `key` of `dataframe` with that column's median (modifies `dataframe`).\"\"\"\n",
+    "    dataframe[key] = dataframe[key].fillna(dataframe[key].median())  # assign back: in-place fillna on a selected column may only change a copy\n",
+    "    return \n",
+    "\n",
+    "def one_hot_encode(dataframe, col_name):\n",
+    "    \"\"\"Return a new frame in which column `col_name` is replaced by one-hot columns prefixed with `col_name`.\"\"\"\n",
+    "    return pd.get_dummies(dataframe, columns=[col_name], prefix = [col_name])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset.loc[dataset['Happiness Score'] < 5, 'Happiness Score'] = 0\n",
+    "dataset.loc[dataset['Happiness Score'] >= 5, 'Happiness Score'] = 1\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Choose your columns: we'll use 'Happiness Score' as the label, and 'Trust (Government Corruption)' and 'Economy (GDP per Capita)' as the features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#chosen_columns = ['Happiness Score','Economy (GDP per Capita)','Family']\n",
+    "chosen_columns = ['Happiness Score','Trust (Government Corruption)','Economy (GDP per Capita)']\n",
+    "#You can experiment with others, such as;'Measles','AdultMortality','infantdeaths','Alcohol','HepatitisB','Measles','Polio','Population','thinness5-9years','HIV/AIDS','BMI','Diphtheria','GDP']\n",
+    "life_expectancy = dataset.filter(chosen_columns)\n",
+    "life_expectancy.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Check the feature columns for NaN values and correct any missing data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "check_NaN(life_expectancy)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Create the train and test splits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_train, x_test, y_train, y_test = train_test_split(life_expectancy.drop(['Happiness Score'], axis=1),life_expectancy['Happiness Score'],test_size=0.2,random_state=12)                                                                       \n",
+    "print(\"x train/test \",x_train.shape, x_test.shape)\n",
+    "print(\"y train/test \",y_train.shape, y_test.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_dev = x_train.values\n",
+    "y_dev = y_train.values\n",
+    "x_t = x_test.values\n",
+    "y_t = y_test.values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Normalisation of data is expected when using SVMs. \n",
+    "Learn more here:\n",
+    "https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#feature scaling: learn the mean/std from the training data only, then reuse them for the test data\n",
+    "sc = StandardScaler()\n",
+    "\n",
+    "x_dev = sc.fit_transform(x_train.values)  # built from x_train so re-running this cell gives the same result\n",
+    "x_t = sc.transform(x_test.values)  # transform only: re-fitting on the test set would put it on a different scale"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Train the linear SVM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Parameters for SVC: Gamma and C\n",
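+    "A lower value of gamma will loosely fit the training dataset, whereas a higher value of gamma will fit the training dataset more and more exactly, which can result in over-fitting. Gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels, so it has no effect on the linear kernel used below.\n",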
+    "\n",
+    "The C parameter controls regularization. A smaller value of C creates a larger-margin hyperplane (more training points may be misclassified), whereas a larger value of C creates a smaller-margin hyperplane (fewer training points are misclassified).\n",
+    "\t\t\t\t\t\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "svm_classifier = SVC(kernel = 'linear')#gamma=0.001,C=100\n",
+    "svm_classifier.fit(x_dev, y_dev)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Inference\n",
+    "Pass in the test set..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions = svm_classifier.predict(x_t)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Evaluation\n",
+    "Check out the mean squared error (for 0/1 class labels this is simply the fraction of misclassified samples) and the accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#mean squared error\n",
+    "np.mean((predictions - y_t) ** 2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Accuracy:\",str(round(metrics.accuracy_score(y_t, predictions)*100))+\"%\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Visualise Linearly Separable Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xs, ys = x_t, y_t\n",
+    "class_cmap = ListedColormap(('orange','grey'))  # one colour per class, shared by the regions and the points\n",
+    "X1, X2 = np.meshgrid(np.arange(start = xs[:,0].min() - 1,stop = xs[:,0].max() + 1,step = 0.01),\n",
+    "                     np.arange(start = xs[:,1].min() - 1,stop = xs[:,1].max() + 1,step = 0.01))\n",
+    "\n",
+    "# Classify every grid point so the two decision regions can be shaded.\n",
+    "plt.contourf(X1,X2, svm_classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape),\n",
+    "            alpha = 0.75, cmap = class_cmap)\n",
+    "plt.xlim(X1.min(),X1.max())\n",
+    "plt.ylim(X2.min(),X2.max())\n",
+    "\n",
+    "for i, j in enumerate(np.unique(ys)):\n",
+    "    # color= (not c=): a bare RGBA tuple passed as c can be mistaken for four values to colour-map\n",
+    "    plt.scatter(xs[ys==j,0],xs[ys==j,1], color=class_cmap(i),label = j)\n",
+    "\n",
+    "plt.title('Test Set')\n",
+    "plt.xlabel('Trust')\n",
+    "plt.ylabel('GDP')\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Exercise\n",
+    "Train an SVM using Family and Life Expectancy to predict whether a country is happy or not. Is this data linearly separable?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
-- 
GitLab