Upload New File

ad4fdf25 · ashepley · 89a06d2a · ad4fdf25
Commit ad4fdf25 authored Aug 4, 2020 by ashepley
--- a/topic_27/Linear_Support_Vector_Machines-Final.ipynb
+++ b/topic_27/Linear_Support_Vector_Machines-Final.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Linear Support Vector Machines\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from matplotlib import pyplot as plt\n",
+    "from sklearn import svm\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.metrics import *\n",
+    "from sklearn import metrics\n",
+    "import matplotlib.pyplot as plt\n",
+    "from matplotlib.colors import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xBlue = np.array([0.3,0.5,1,1.4,1.7,2])\n",
+    "yBlue = np.array([1,4.5,2.3,1.9,8.9,4.1])\n",
+    "xRed = np.array([3.3,3.5,4,4.4,5.7,6])\n",
+    "yRed = np.array([7,1.5,6.3,1.9,2.9,7.1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = np.array([[0.3,1],[0.5,4.5],[1,2.3],[1.4,1.9],[1.7,8.9],[2,4.1],[3.3,7],[3.5,1.5],[4,6.3],[4.4,1.9],[5.7,2.9],[6,7.1]])\n",
+    "y = np.array([0,0,0,0,0,0,1,1,1,1,1,1]) # 0: blue class, 1: red class"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plain 'o' marker format: the original 'ro' also requested red, which clashed\n",
+    "# with the explicit color= keyword (matplotlib warns about the redundant colour).\n",
+    "plt.plot(xBlue, yBlue, 'o', color='blue')\n",
+    "plt.plot(xRed, yRed, 'o', color='red')\n",
+    "plt.plot(4.5,4.5,'o',color='green')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "classifier = svm.SVC()\n",
+    "classifier.fit(X,y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "coord = [4.5,4.5]\n",
+    "blue_red = classifier.predict([coord])\n",
+    "\n",
+    "if blue_red == 1:\n",
+    "    print(coord,\" is red\")\n",
+    "else:\n",
+    "    print(coord, \" is blue\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Let's apply this to 'real' data!\n",
+    "\n",
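+    "We'll be using a Support Vector Machine to predict whether a country is happy or not (a Happiness Score of at least 5) based on its government-trust and GDP-per-capita scores. The code below reads `./datasets/2015.csv`, whose columns match the 2015 World Happiness Report data (https://www.kaggle.com/unsdsn/world-happiness). The WHO life-expectancy dataset is available here if you would like to experiment with it as well: https://www.kaggle.com/augustus0498/life-expectancy-who"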
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.model_selection import *\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "import numpy as np\n",
+    "from sklearn.metrics import mean_squared_error\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "def load_data(DATASET_PATH):\n",
+    "    return pd.read_csv(DATASET_PATH)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATASET_PATH = './datasets/2015.csv'\n",
+    "dataset = load_data(DATASET_PATH)\n",
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#functions\n",
+    "def check_NaN(dataframe):\n",
+    "    print(\"Total NaN:\", dataframe.isnull().values.sum())\n",
+    "    print(\"NaN by column:\\n\",dataframe.isnull().sum())\n",
+    "    return\n",
+    "\n",
+    "def fillNaN_median(dataframe, key):\n",
+    "    \"\"\"Fill missing values in column `key` of `dataframe` with that column's median.\n",
+    "\n",
+    "    The frame is modified in place; nothing is returned.\n",
+    "    \"\"\"\n",
+    "    median = dataframe[key].median()\n",
+    "    # Assign the filled column back: calling fillna(inplace=True) on the\n",
+    "    # dataframe[key] selection may only update a temporary copy.\n",
+    "    dataframe[key] = dataframe[key].fillna(median)\n",
+    "    return\n",
+    "\n",
+    "def one_hot_encode(dataframe, col_name):\n",
+    "    dataframe = pd.get_dummies(dataframe, columns=[col_name], prefix = [col_name])\n",
+    "    return dataframe\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset.loc[dataset['Happiness Score'] < 5, 'Happiness Score'] = 0\n",
+    "dataset.loc[dataset['Happiness Score'] >= 5, 'Happiness Score'] = 1\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Choose your features: we'll be choosing 'Happiness Score' (the label), 'Trust (Government Corruption)' and 'Economy (GDP per Capita)'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#chosen_columns = ['Happiness Score','Economy (GDP per Capita)','Family']\n",
+    "chosen_columns = ['Happiness Score','Trust (Government Corruption)','Economy (GDP per Capita)']\n",
+    "#You can experiment with others, such as;'Measles','AdultMortality','infantdeaths','Alcohol','HepatitisB','Measles','Polio','Population','thinness5-9years','HIV/AIDS','BMI','Diphtheria','GDP']\n",
+    "life_expectancy = dataset.filter(chosen_columns)\n",
+    "life_expectancy.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Check the feature columns for NaN values and correct any missing data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "check_NaN(life_expectancy)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Create the train and test splits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_train, x_test, y_train, y_test = train_test_split(life_expectancy.drop(['Happiness Score'], axis=1),life_expectancy['Happiness Score'],test_size=0.2,random_state=12)                                                                       \n",
+    "print(\"x train/test \",x_train.shape, x_test.shape)\n",
+    "print(\"y train/test \",y_train.shape, y_test.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x_dev = x_train.values\n",
+    "y_dev = y_train.values\n",
+    "x_t = x_test.values\n",
+    "y_t = y_test.values"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Normalisation of data is expected when using SVMs. \n",
+    "Learn more here:\n",
+    "https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#feature scaling\n",
+    "sc = StandardScaler()\n",
+    "\n",
+    "# Learn the mean / standard deviation from the training data only ...\n",
+    "x_dev = sc.fit_transform(x_dev)\n",
+    "# ... and reuse those statistics for the test data. Re-fitting on the test set\n",
+    "# would scale it differently from the data the SVM is trained on.\n",
+    "x_t = sc.transform(x_t)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Train the linear SVM"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Parameters for SVC: Gamma and C\n",
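+    "A lower value of gamma fits the training dataset loosely, whereas a higher value of gamma fits the training dataset more closely and can result in over-fitting. Note that gamma only applies to the 'rbf', 'poly' and 'sigmoid' kernels; it has no effect on the linear kernel used below.\n",
+    "\n",
+    "The C parameter controls regularization; the strength of the regularization is inversely proportional to C. A smaller value of C tolerates more margin violations and creates a larger-margin hyperplane, whereas a larger value of C penalises misclassified training points more heavily and creates a smaller-margin hyperplane.\n",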
+    "\t\t\t\t\t\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "svm_classifier = SVC(kernel = 'linear')#gamma=0.001,C=100\n",
+    "svm_classifier.fit(x_dev, y_dev)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Inference\n",
+    "Pass in the test set..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions = svm_classifier.predict(x_t)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Evaluation\n",
+    "Check out the mean squared error and accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#mean squared error\n",
+    "np.mean((predictions - y_t) ** 2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Accuracy:\",str(round(metrics.accuracy_score(y_t, predictions)*100))+\"%\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Visualise Linearly Separable Data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xs, ys = x_t, y_t\n",
+    "\n",
+    "X1, X2 = np.meshgrid(np.arange(start = xs[:,0].min() - 1,stop = xs[:,0].max() + 1,step = 0.01),\n",
+    "                     np.arange(start = xs[:,1].min() - 1,stop = xs[:,1].max() + 1,step = 0.01))\n",
+    "\n",
+    "plt.contourf(X1,X2, svm_classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape),\n",
+    "            alpha = 0.75, cmap = ListedColormap(('orange','grey')))\n",
+    "\n",
+    "plt.xlim(X1.min(),X1.max())\n",
+    "plt.ylim(X2.min(),X2.max())\n",
+    "\n",
+    "for i, j in enumerate(np.unique(ys)):\n",
+    "    # Pass the single RGBA tuple via color=, not c=: scatter may treat a 4-tuple\n",
+    "    # given as c= as four values to colour-map when a class has exactly four points.\n",
+    "    plt.scatter(xs[ys==j,0],xs[ys==j,1],\n",
+    "                color=ListedColormap(('orange','grey'))(i),label = j)\n",
+    "\n",
+    "plt.title('Test Set')\n",
+    "plt.xlabel('Trust')\n",
+    "plt.ylabel('GDP')\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Exercise\n",
+    "Train an SVM using Family and Life Expectancy to predict whether a country is happy or not. Is this data linearly separable?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:markdown id: tags:
+### Linear Support Vector Machines
+%% Cell type:code id: tags:
+``` python
+import numpy as np
+from matplotlib import pyplot as plt
+from sklearn import svm
+from sklearn.preprocessing import StandardScaler
+from sklearn.svm import SVC
+from sklearn.metrics import *
+from sklearn import metrics
+import matplotlib.pyplot as plt
+from matplotlib.colors import *
+```
+%% Cell type:code id: tags:
+``` python
+xBlue = np.array([0.3,0.5,1,1.4,1.7,2])
+yBlue = np.array([1,4.5,2.3,1.9,8.9,4.1])
+xRed = np.array([3.3,3.5,4,4.4,5.7,6])
+yRed = np.array([7,1.5,6.3,1.9,2.9,7.1])
+```
+%% Cell type:code id: tags:
+``` python
+X = np.array([[0.3,1],[0.5,4.5],[1,2.3],[1.4,1.9],[1.7,8.9],[2,4.1],[3.3,7],[3.5,1.5],[4,6.3],[4.4,1.9],[5.7,2.9],[6,7.1]])
+y = np.array([0,0,0,0,0,0,1,1,1,1,1,1]) # 0: blue class, 1: red class
+```
+%% Cell type:code id: tags:
+``` python
+plt.plot(xBlue, yBlue, 'ro', color='blue')
+plt.plot(xRed, yRed, 'ro', color='red')
+plt.plot(4.5,4.5,'ro',color='green')
+```
+%% Cell type:code id: tags:
+``` python
+# NOTE(review): SVC() with no arguments uses scikit-learn's default kernel ('rbf'),
+# not a linear one - pass kernel='linear' if a linear SVM is intended here.
+classifier = svm.SVC()
+# Fit on the toy points X and their labels y.
+classifier.fit(X,y)
+```
+%% Cell type:code id: tags:
+``` python
+coord = [4.5,4.5]
+blue_red = classifier.predict([coord])
+if blue_red == 1:
+    print(coord," is red")
+else:
+    print(coord, " is blue")
+```
+%% Cell type:markdown id: tags:
+### Let's apply this to 'real' data!
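+We'll be using a Support Vector Machine to predict whether a country is happy or not (a Happiness Score of at least 5) based on its government-trust and GDP-per-capita scores. The code below reads `./datasets/2015.csv`, whose columns match the 2015 World Happiness Report data (https://www.kaggle.com/unsdsn/world-happiness). The WHO life-expectancy dataset is available here if you would like to experiment with it as well: https://www.kaggle.com/augustus0498/life-expectancy-who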
+%% Cell type:code id: tags:
+``` python
+import pandas as pd
+from sklearn.model_selection import *
+from sklearn.linear_model import LinearRegression
+import numpy as np
+from sklearn.metrics import mean_squared_error
+from sklearn.preprocessing import StandardScaler
+def load_data(DATASET_PATH):
+    return pd.read_csv(DATASET_PATH)
+```
+%% Cell type:code id: tags:
+``` python
+# Relative path of the CSV file analysed in the rest of the notebook.
+DATASET_PATH = './datasets/2015.csv'
+dataset = load_data(DATASET_PATH)
+# Bare last expression so the notebook displays the first rows.
+dataset.head()
+```
+%% Cell type:code id: tags:
+``` python
+#functions
+def check_NaN(dataframe):
+    print("Total NaN:", dataframe.isnull().values.sum())
+    print("NaN by column:\n",dataframe.isnull().sum())
+    return
+def fillNaN_median(dataframe, key):
+    median = dataframe[key].median()
+    dataframe[key].fillna(median, inplace = True)
+    return
+def one_hot_encode(dataframe, col_name):
+    dataframe = pd.get_dummies(dataframe, columns=[col_name], prefix = [col_name])
+    return dataframe
+```
+%% Cell type:code id: tags:
+``` python
+dataset.columns
+```
+%% Cell type:code id: tags:
+``` python
+dataset.loc[dataset['Happiness Score'] < 5, 'Happiness Score'] = 0
+dataset.loc[dataset['Happiness Score'] >= 5, 'Happiness Score'] = 1
+```
+%% Cell type:code id: tags:
+``` python
+dataset.head()
+```
+%% Cell type:markdown id: tags:
+#### Choose your features: we'll be choosing 'Happiness Score' (the label), 'Trust (Government Corruption)' and 'Economy (GDP per Capita)'
+%% Cell type:code id: tags:
+``` python
+# Alternative feature set to try (uncomment to use it instead of the line below):
+#chosen_columns = ['Happiness Score','Economy (GDP per Capita)','Family']
+# 'Happiness Score' is the 0/1 label; the other two columns are the features.
+chosen_columns = ['Happiness Score','Trust (Government Corruption)','Economy (GDP per Capita)']
+#You can experiment with others, such as;'Measles','AdultMortality','infantdeaths','Alcohol','HepatitisB','Measles','Polio','Population','thinness5-9years','HIV/AIDS','BMI','Diphtheria','GDP']
+# NOTE(review): the names in the list above look like WHO life-expectancy columns
+# rather than columns of this file - check dataset.columns before using them.
+# DataFrame.filter keeps only the listed columns; labels that are not present
+# are skipped without an error.
+life_expectancy = dataset.filter(chosen_columns)
+life_expectancy.head()
+```
+%% Cell type:markdown id: tags:
+#### Check the feature columns for NaN values and correct any missing data
+%% Cell type:code id: tags:
+``` python
+check_NaN(life_expectancy)
+```
+%% Cell type:markdown id: tags:
+#### Create the train and test splits
+%% Cell type:code id: tags:
+``` python
+x_train, x_test, y_train, y_test = train_test_split(life_expectancy.drop(['Happiness Score'], axis=1),life_expectancy['Happiness Score'],test_size=0.2,random_state=12)
+print("x train/test ",x_train.shape, x_test.shape)
+print("y train/test ",y_train.shape, y_test.shape)
+```
+%% Cell type:code id: tags:
+``` python
+x_dev = x_train.values
+y_dev = y_train.values
+x_t = x_test.values
+y_t = y_test.values
+```
+%% Cell type:markdown id: tags:
+#### Normalisation of data is expected when using SVMs.
+Learn more here:
+https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
+%% Cell type:code id: tags:
+``` python
+#feature scaling
+sc = StandardScaler()
+x_dev = sc.fit_transform(x_dev)
+x_t = sc.fit_transform(x_t)
+```
+%% Cell type:markdown id: tags:
+#### Train the linear SVM
+%% Cell type:markdown id: tags:
+### Parameters for SVC: Gamma and C
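+A lower value of gamma fits the training dataset loosely, whereas a higher value of gamma fits the training dataset more closely and can result in over-fitting. Note that gamma only applies to the 'rbf', 'poly' and 'sigmoid' kernels; it has no effect on the linear kernel used below.
+The C parameter controls regularization; the strength of the regularization is inversely proportional to C. A smaller value of C tolerates more margin violations and creates a larger-margin hyperplane, whereas a larger value of C penalises misclassified training points more heavily and creates a smaller-margin hyperplane.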
+%% Cell type:code id: tags:
+``` python
+# Linear-kernel SVM with the default C. The trailing comment lists values to try;
+# NOTE(review): gamma is not used by the linear kernel, so only C would matter here.
+svm_classifier = SVC(kernel = 'linear')#gamma=0.001,C=100
+# Train on the scaled training features and their labels.
+svm_classifier.fit(x_dev, y_dev)
+```
+%% Cell type:markdown id: tags:
+#### Inference
+Pass in the test set...
+%% Cell type:code id: tags:
+``` python
+# Predicted class labels for the scaled test features.
+predictions = svm_classifier.predict(x_t)
+```
+%% Cell type:markdown id: tags:
+#### Evaluation
+Check out the mean squared error and accuracy
+%% Cell type:code id: tags:
+``` python
+#mean squared error
+# Element-wise squared difference between predicted and true labels, then averaged.
+squared_errors = (predictions - y_t) ** 2
+squared_errors.mean()
+```
+%% Cell type:code id: tags:
+``` python
+print("Accuracy:",str(round(metrics.accuracy_score(y_t, predictions)*100))+"%")
+```
+%% Cell type:markdown id: tags:
+#### Visualise Linearly Separable Data
+%% Cell type:code id: tags:
+``` python
+xs, ys = x_t, y_t
+X1, X2 = np.meshgrid(np.arange(start = xs[:,0].min() - 1,stop = xs[:,0].max() + 1,step = 0.01),
+                     np.arange(start = xs[:,1].min() - 1,stop = xs[:,1].max() + 1,step = 0.01))
+plt.contourf(X1,X2, svm_classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape),
+            alpha = 0.75, cmap = ListedColormap(('orange','grey')))
+plt.xlim(X1.min(),X1.max())
+plt.ylim(X2.min(),X2.max())
+for i, j in enumerate(np.unique(ys)):
+    plt.scatter(xs[ys==j,0],xs[ys==j,1],
+                c=ListedColormap(('orange','grey'))(i),label = j)
+plt.title('Test Set')
+plt.xlabel('Trust')
+plt.ylabel('GDP')
+plt.legend()
+plt.show()
+```
+%% Cell type:markdown id: tags:
+### Exercise
+Train an SVM using Family and Life Expectancy to predict whether a country is happy or not. Is this data linearly separable?
+%% Cell type:code id: tags:
+``` python
+```