Skip to content
Snippets Groups Projects
Commit ad4fdf25 authored by ashepley's avatar ashepley
Browse files

Upload New File

parent 89a06d2a
Branches
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
### Linear Support Vector Machines
%% Cell type:code id: tags:
``` python
import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import *
from sklearn import metrics
import matplotlib.pyplot as plt
from matplotlib.colors import *
```
%% Cell type:code id: tags:
``` python
# Toy 2-D training points, kept as separate x / y coordinate vectors so each
# class can be handed straight to plt.plot. Each literal holds the x row first
# and the y row second.
xBlue, yBlue = np.array([
    [0.3, 0.5, 1.0, 1.4, 1.7, 2.0],
    [1.0, 4.5, 2.3, 1.9, 8.9, 4.1],
])
xRed, yRed = np.array([
    [3.3, 3.5, 4.0, 4.4, 5.7, 6.0],
    [7.0, 1.5, 6.3, 1.9, 2.9, 7.1],
])
```
%% Cell type:code id: tags:
``` python
# The twelve toy points as one (n_samples, 2) feature matrix: the six blue
# samples first, then the six red ones.
X = np.array([
    [0.3, 1], [0.5, 4.5], [1, 2.3], [1.4, 1.9], [1.7, 8.9], [2, 4.1],
    [3.3, 7], [3.5, 1.5], [4, 6.3], [4.4, 1.9], [5.7, 2.9], [6, 7.1],
])
# One label per row of X -- 0: blue class, 1: red class
y = np.array([0] * 6 + [1] * 6)
```
%% Cell type:code id: tags:
``` python
# Scatter the two training classes plus the query point (4.5, 4.5) that is
# classified further down. The format string is marker-only ('o'): the colour
# is given by the `color` keyword, and a fmt such as 'ro' would name a second,
# conflicting colour that Matplotlib reports as redundantly defined.
plt.plot(xBlue, yBlue, 'o', color='blue')
plt.plot(xRed, yRed, 'o', color='red')
plt.plot(4.5, 4.5, 'o', color='green')
```
%% Cell type:code id: tags:
``` python
# Fit a linear-kernel SVM on the toy data. SVC() with no arguments uses the
# RBF kernel, so the kernel is named explicitly to get the linear decision
# boundary this section is about.
classifier = svm.SVC(kernel='linear')
classifier.fit(X, y)
```
%% Cell type:code id: tags:
``` python
# Classify a single query point. predict() expects a 2-D input (one row per
# sample) and returns one label per row, hence the wrapping list and the [0].
coord = [4.5, 4.5]
blue_red = classifier.predict([coord])
if blue_red[0] == 1:
    print(coord, " is red")
else:
    print(coord, " is blue")
```
%% Cell type:markdown id: tags:
### Let's apply this to 'real' data!
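We'll be using a Support Vector Machine to predict whether a country is developed or not based on its World Health Organisation life expectancy and GDP. You can access the dataset in this lecture here: https://www.kaggle.com/augustus0498/life-expectancy-who
Note: the code cells below load `./datasets/2015.csv` and work with its `Happiness Score`, `Trust (Government Corruption)` and `Economy (GDP per Capita)` columns, so the model that is actually trained predicts whether a country is happy (Happiness Score of 5 or more) from those two features.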
%% Cell type:code id: tags:
``` python
import pandas as pd
from sklearn.model_selection import *
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
def load_data(DATASET_PATH):
    """Read the CSV file at ``DATASET_PATH`` and return it as a DataFrame.

    Parameters
    ----------
    DATASET_PATH : str or path-like
        Location of the CSV file; passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(DATASET_PATH)
```
%% Cell type:code id: tags:
``` python
# Relative path of the CSV file used for the rest of the notebook.
DATASET_PATH = './datasets/2015.csv'
dataset = load_data(DATASET_PATH)
# Last expression of the cell: displays the first rows of the loaded frame.
dataset.head()
```
%% Cell type:code id: tags:
``` python
#functions
def check_NaN(dataframe):
    """Print how many missing values ``dataframe`` holds, in total and per column.

    The frame is not modified and nothing is returned; the counts are written
    to standard output.
    """
    # Build the boolean "is missing" mask once and reuse it for both reports.
    missing = dataframe.isnull()
    print("Total NaN:", missing.values.sum())
    print("NaN by column:\n", missing.sum())
def fillNaN_median(dataframe, key):
    """Replace the missing values in column ``key`` with that column's median.

    ``dataframe`` is modified in place; nothing is returned.
    """
    median = dataframe[key].median()
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on dataframe[key]: that form works on an intermediate object and, with
    # pandas copy-on-write, leaves the frame itself unchanged.
    dataframe[key] = dataframe[key].fillna(median)
def one_hot_encode(dataframe, col_name):
    """Return a new frame in which ``col_name`` is one-hot encoded.

    The column is replaced by one indicator column per distinct value, each
    named with ``col_name`` as its prefix. The frame passed in is not
    modified, so callers must use the return value.
    """
    return pd.get_dummies(dataframe, columns=[col_name], prefix=[col_name])
```
%% Cell type:code id: tags:
``` python
# Display the column labels of the loaded frame.
dataset.columns
```
%% Cell type:code id: tags:
``` python
# Binarise the target: scores below 5 become 0, scores of 5 or more become 1.
# Both masks are computed before either assignment is made.
happiness = dataset['Happiness Score']
below_five = happiness < 5
five_or_more = happiness >= 5
dataset.loc[below_five, 'Happiness Score'] = 0
dataset.loc[five_or_more, 'Happiness Score'] = 1
```
%% Cell type:code id: tags:
``` python
# Display the first rows again to check the binarised 'Happiness Score' column.
dataset.head()
```
%% Cell type:markdown id: tags:
#### Choose your features: We'll be choosing 'Happiness Score' (the label to predict), 'Trust (Government Corruption)' and 'Economy (GDP per Capita)'
%% Cell type:code id: tags:
``` python
# 'Happiness Score' is the label; the other two columns are the features.
chosen_columns = [
    'Happiness Score',
    'Trust (Government Corruption)',
    'Economy (GDP per Capita)',
]
# Alternative selection to try:
#   ['Happiness Score', 'Economy (GDP per Capita)', 'Family']
# Other columns suggested for experimentation: 'Measles', 'AdultMortality',
# 'infantdeaths', 'Alcohol', 'HepatitisB', 'Polio', 'Population',
# 'thinness5-9years', 'HIV/AIDS', 'BMI', 'Diphtheria', 'GDP'.
# NOTE(review): these names look like they belong to a different dataset --
# confirm they exist in this CSV (see dataset.columns) before using them.
life_expectancy = dataset.filter(items=chosen_columns)
life_expectancy.head()
```
%% Cell type:markdown id: tags:
#### Check the feature columns for NaN values and correct any missing data
%% Cell type:code id: tags:
``` python
# Report missing values in the selected columns (prints counts only; nothing
# is filled in by this call).
check_NaN(life_expectancy)
```
%% Cell type:markdown id: tags:
#### Create the train and test splits
%% Cell type:code id: tags:
``` python
# Separate the label column from the feature columns, then hold out 20% of
# the rows for testing. random_state is fixed so the split is repeatable.
target_column = 'Happiness Score'
features = life_expectancy.drop([target_column], axis=1)
labels = life_expectancy[target_column]
x_train, x_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=12
)
print("x train/test ", x_train.shape, x_test.shape)
print("y train/test ", y_train.shape, y_test.shape)
```
%% Cell type:code id: tags:
``` python
# Pull the underlying NumPy arrays out of the pandas objects:
# *_dev is the training split, *_t is the test split.
x_dev, y_dev = x_train.values, y_train.values
x_t, y_t = x_test.values, y_test.values
```
%% Cell type:markdown id: tags:
#### Normalisation of data is expected when using SVMs.
Learn more here:
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
%% Cell type:code id: tags:
``` python
# Feature scaling: learn the per-feature mean and standard deviation on the
# training split only, then apply those same statistics to the test split.
# Re-fitting on the test split would scale it differently from the data the
# classifier is trained on and would leak test-set statistics.
sc = StandardScaler()
x_dev = sc.fit_transform(x_dev)
x_t = sc.transform(x_t)
```
%% Cell type:markdown id: tags:
#### Train the linear SVM
%% Cell type:markdown id: tags:
### Parameters for SVC: Gamma and C
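A lower value of gamma fits the training dataset loosely, whereas a higher value of gamma fits the training dataset very closely, which can result in over-fitting. Gamma only applies to the 'rbf', 'poly' and 'sigmoid' kernels; it has no effect on a linear kernel.
The C parameter controls regularization. A smaller value of C regularizes more strongly and gives a larger-margin hyperplane that tolerates some misclassified training points; a larger value of C gives a smaller-margin hyperplane that tries to classify every training point correctly.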
%% Cell type:code id: tags:
``` python
# Linear-kernel SVM with the library's default C, trained on the scaled
# training split. The trailing comment lists values to experiment with; note
# that gamma is a coefficient of the 'rbf', 'poly' and 'sigmoid' kernels and
# is not used by the linear kernel.
svm_classifier = SVC(kernel = 'linear')#gamma=0.001,C=100
svm_classifier.fit(x_dev, y_dev)
```
%% Cell type:markdown id: tags:
#### Inference
Pass in the test set...
%% Cell type:code id: tags:
``` python
# Predicted class label for every row of the scaled test split.
predictions = svm_classifier.predict(x_t)
```
%% Cell type:markdown id: tags:
#### Evaluation
Check out the mean squared error and accuracy
%% Cell type:code id: tags:
``` python
# Mean squared error between predicted and true labels. The labels were
# binarised to 0/1 earlier in the notebook, so each squared difference is 1
# for a misclassified sample and 0 otherwise.
np.mean((predictions - y_t) ** 2)
```
%% Cell type:code id: tags:
``` python
# Share of test samples labelled correctly, rounded to a whole percentage.
accuracy_pct = round(metrics.accuracy_score(y_t, predictions) * 100)
print("Accuracy:", "{}%".format(accuracy_pct))
```
%% Cell type:markdown id: tags:
#### Visualise Linearly Separable Data
%% Cell type:code id: tags:
``` python
# Plot the classifier's decision regions together with the scaled test points.
xs, ys = x_t, y_t
# Shared by the region fill and the scatter points: first colour for the
# first class found by np.unique, second colour for the second.
class_colours = ListedColormap(('orange', 'grey'))

# Dense grid covering the test points with a margin of 1 on every side.
X1, X2 = np.meshgrid(
    np.arange(start=xs[:, 0].min() - 1, stop=xs[:, 0].max() + 1, step=0.01),
    np.arange(start=xs[:, 1].min() - 1, stop=xs[:, 1].max() + 1, step=0.01),
)
# Classify every grid node and colour the plane by the predicted class.
grid_labels = svm_classifier.predict(np.array([X1.ravel(), X2.ravel()]).T)
plt.contourf(X1, X2, grid_labels.reshape(X1.shape),
             alpha=0.75, cmap=class_colours)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# One scatter call per class so that each class gets its own legend entry.
for i, j in enumerate(np.unique(ys)):
    # `color=` rather than `c=`: a single RGBA tuple passed as `c` can be
    # read as four values to colour-map when a class has exactly four points.
    plt.scatter(xs[ys == j, 0], xs[ys == j, 1],
                color=class_colours(i), label=j)
plt.title('Test Set')
plt.xlabel('Trust')
plt.ylabel('GDP')
plt.legend()
plt.show()
```
%% Cell type:markdown id: tags:
### Exercise
Train an SVM using Family and Life Expectancy to predict whether a country is happy or not. Is this data linearly separable?
%% Cell type:code id: tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment