Skip to content
Snippets Groups Projects
Commit ad4fdf25 authored by ashepley's avatar ashepley
Browse files

Upload New File

parent 89a06d2a
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
### Linear Support Vector Machines
%% Cell type:code id: tags:
``` python
import numpy as np
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import *
from sklearn import metrics
import matplotlib.pyplot as plt
from matplotlib.colors import *
```
%% Cell type:code id: tags:
``` python
# Toy 2-D samples: six points per class, stored as separate x / y coordinate arrays.
xBlue, yBlue = (
    np.array([0.3, 0.5, 1, 1.4, 1.7, 2]),
    np.array([1, 4.5, 2.3, 1.9, 8.9, 4.1]),
)
xRed, yRed = (
    np.array([3.3, 3.5, 4, 4.4, 5.7, 6]),
    np.array([7, 1.5, 6.3, 1.9, 2.9, 7.1]),
)
```
%% Cell type:code id: tags:
``` python
# Training matrix: one (x, y) row per sample, the six blue points first,
# followed by the six red points.
X = np.array([
    [0.3, 1], [0.5, 4.5], [1, 2.3], [1.4, 1.9], [1.7, 8.9], [2, 4.1],
    [3.3, 7], [3.5, 1.5], [4, 6.3], [4.4, 1.9], [5.7, 2.9], [6, 7.1],
])
# Class labels aligned with the rows of X -- 0: blue class, 1: red class
y = np.array([0] * 6 + [1] * 6)
```
%% Cell type:code id: tags:
``` python
# Scatter both training classes and, in green, the query point (4.5, 4.5)
# that is classified further down.
# The format string is marker-only ('o'): the previous 'ro' also encoded the
# colour red, which clashed with the `color` keyword and made matplotlib warn
# that the colour was redundantly defined.
plt.plot(xBlue, yBlue, 'o', color='blue')
plt.plot(xRed, yRed, 'o', color='red')
plt.plot(4.5, 4.5, 'o', color='green')
```
%% Cell type:code id: tags:
``` python
# This section demonstrates a *linear* SVM, so request the linear kernel
# explicitly; a bare SVC() would fall back to the library's default kernel.
classifier = svm.SVC(kernel='linear')
# Fit on the toy points (X) and their blue/red labels (y).
classifier.fit(X, y)
```
%% Cell type:code id: tags:
``` python
# Classify a single new point and report the predicted colour.
coord = [4.5,4.5]
# predict() takes a list of samples, so the single point is wrapped in a list.
blue_red = classifier.predict([coord])
# Read the one returned label explicitly instead of relying on the
# truthiness of a one-element array.
if blue_red[0] == 1:
    print(coord," is red")
else:
    print(coord, " is blue")
```
%% Cell type:markdown id: tags:
### Let's apply this to 'real' data!
We'll be using a Support Vector Machine to predict whether a country is developed or not based on its World Health Organisation life expectancy and GDP. You can access the dataset used in this lecture here: https://www.kaggle.com/augustus0498/life-expectancy-who
%% Cell type:code id: tags:
``` python
import pandas as pd
from sklearn.model_selection import *
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
def load_data(DATASET_PATH):
    """Read a CSV file into a pandas DataFrame.

    Args:
        DATASET_PATH: Path (or any other source accepted by
            ``pd.read_csv``) of the CSV file to load.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    return pd.read_csv(DATASET_PATH)
```
%% Cell type:code id: tags:
``` python
# CSV location, given relative to the notebook's working directory.
DATASET_PATH = './datasets/2015.csv'
# Load the file into a DataFrame via the helper defined above.
dataset = load_data(DATASET_PATH)
# Preview the first rows to sanity-check the load.
dataset.head()
```
%% Cell type:code id: tags:
``` python
#functions
def check_NaN(dataframe):
    """Print how many missing values a DataFrame contains.

    Two lines are printed: the total count of missing cells, followed by
    the per-column counts.

    Args:
        dataframe: The pandas DataFrame to inspect.

    Returns:
        None. The report is written to standard output only.
    """
    missing = dataframe.isnull()
    print("Total NaN:", missing.values.sum())
    print("NaN by column:\n",missing.sum())
def fillNaN_median(dataframe, key):
    """Replace missing values in one column with that column's median.

    The DataFrame is modified in place.

    Args:
        dataframe: The pandas DataFrame to modify.
        key: Label of the column whose missing values are filled.

    Returns:
        None.
    """
    median = dataframe[key].median()
    # Assign the filled column back instead of calling
    # ``dataframe[key].fillna(..., inplace=True)``: that chained in-place
    # form acts on an intermediate object and is not guaranteed to update
    # the parent frame.
    dataframe[key] = dataframe[key].fillna(median)
    return
def one_hot_encode(dataframe, col_name):
    """Return a copy of the frame with one column one-hot encoded.

    The encoding is delegated to ``pd.get_dummies``; the generated columns
    are prefixed with the original column name.

    Args:
        dataframe: The pandas DataFrame containing the column.
        col_name: Label of the column to encode.

    Returns:
        A new DataFrame; the DataFrame passed in is left unchanged.
    """
    return pd.get_dummies(dataframe, columns=[col_name], prefix=[col_name])
```
%% Cell type:code id: tags:
``` python
# List the available column labels before choosing the label and features.
dataset.columns
```
%% Cell type:code id: tags:
``` python
# Turn the continuous 'Happiness Score' into a binary label:
# 0 for scores below 5, 1 for scores of 5 and above.
scores = dataset['Happiness Score']
# Guard against re-running the cell: once the column holds only 0/1, every
# value is < 5, so a second pass would overwrite all the 1 labels with 0.
if not scores.dropna().isin([0, 1]).all():
    # Compute both masks from the raw scores before writing anything.
    unhappy, happy = scores < 5, scores >= 5
    dataset.loc[unhappy, 'Happiness Score'] = 0
    dataset.loc[happy, 'Happiness Score'] = 1
```
%% Cell type:code id: tags:
``` python
# Preview the frame again to confirm 'Happiness Score' now holds the 0/1 label.
dataset.head()
```
%% Cell type:markdown id: tags:
#### Choose your features: We'll be choosing 'Happiness Score' (the label), 'Trust (Government Corruption)' and 'Economy (GDP per Capita)'
%% Cell type:code id: tags:
``` python
# Label column first, followed by the two feature columns fed to the SVM.
chosen_columns = ['Happiness Score','Trust (Government Corruption)','Economy (GDP per Capita)']
# To experiment, swap in other feature columns, e.g. 'Family'.
# NOTE(review): the suggestion list that used to sit here ('Measles',
# 'AdultMortality', 'Polio', ...) appears to belong to a different dataset --
# check `dataset.columns` before using any of those names.
# Select by label rather than DataFrame.filter so that a misspelt column
# raises KeyError instead of being dropped silently; .copy() keeps the
# selection independent of `dataset`.
life_expectancy = dataset[chosen_columns].copy()
life_expectancy.head()
```
%% Cell type:markdown id: tags:
#### Check the feature columns for NaN values and correct any missing data
%% Cell type:code id: tags:
``` python
# Print the NaN counts for the selected columns; fillNaN_median (defined
# above) can be used on any column that turns out to have gaps.
check_NaN(life_expectancy)
```
%% Cell type:markdown id: tags:
#### Create the train and test splits
%% Cell type:code id: tags:
``` python
# Separate the label column from the feature columns, then hold out 20% of
# the rows for testing (fixed seed so the split is reproducible).
features = life_expectancy.drop(columns=['Happiness Score'])
labels = life_expectancy['Happiness Score']
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=12,
)
# Report the shapes of the resulting splits.
print("x train/test ", x_train.shape, x_test.shape)
print("y train/test ", y_train.shape, y_test.shape)
```
%% Cell type:code id: tags:
``` python
# Unwrap the pandas objects into plain NumPy arrays:
# *_dev holds the training split, *_t the held-out test split.
x_dev, y_dev = x_train.values, y_train.values
x_t, y_t = x_test.values, y_test.values
```
%% Cell type:markdown id: tags:
#### Normalisation of data is expected when using SVMs.
Learn more here:
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
%% Cell type:code id: tags:
``` python
#feature scaling
sc = StandardScaler()
x_dev = sc.fit_transform(x_dev)
x_t = sc.fit_transform(x_t)
```
%% Cell type:markdown id: tags:
#### Train the linear SVM
%% Cell type:markdown id: tags:
### Parameters for SVC: Gamma and C
A lower value of Gamma will loosely fit the training dataset, whereas a higher value of gamma will exactly fit the training dataset resulting in over-fitting.
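The C parameter controls the strength of regularization. A smaller value of C creates a larger-margin hyperplane (stronger regularization, more training points allowed inside or on the wrong side of the margin), and a larger value of C creates a smaller-margin hyperplane (weaker regularization, fewer misclassified training points tolerated).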
%% Cell type:code id: tags:
``` python
# Build a linear-kernel support vector classifier.
# Settings to experiment with: C=100, and gamma=0.001.
# NOTE(review): gamma presumably only has an effect with a non-linear kernel
# such as 'rbf' -- confirm against the SVC documentation.
svm_classifier = SVC(kernel = 'linear')
# Fit on the scaled training features and their 0/1 labels.
svm_classifier.fit(x_dev, y_dev)
```
%% Cell type:markdown id: tags:
#### Inference
Pass in the test set...
%% Cell type:code id: tags:
``` python
# Predict a class label for every row of the scaled test features.
predictions = svm_classifier.predict(x_t)
```
%% Cell type:markdown id: tags:
#### Evaluation
Check out the mean squared error and accuracy
%% Cell type:code id: tags:
``` python
#mean squared error
# Mean squared error between predicted and true test labels.
squared_errors = (predictions - y_t) ** 2
squared_errors.mean()
```
%% Cell type:code id: tags:
``` python
# Share of test rows labelled correctly, shown as a rounded percentage.
accuracy_percent = round(metrics.accuracy_score(y_t, predictions) * 100)
print("Accuracy:", f"{accuracy_percent}%")
```
%% Cell type:markdown id: tags:
#### Visualise Linearly Separable Data
%% Cell type:code id: tags:
``` python
# Plot the classifier's decision regions together with the scaled test points.
xs, ys = x_t, y_t
# One colour per class, shared by the background regions and the points.
class_colours = ListedColormap(('orange','grey'))
# Dense grid (step 0.01) covering the test points plus a one-unit border.
X1, X2 = np.meshgrid(
    np.arange(start=xs[:, 0].min() - 1, stop=xs[:, 0].max() + 1, step=0.01),
    np.arange(start=xs[:, 1].min() - 1, stop=xs[:, 1].max() + 1, step=0.01),
)
# Classify every grid node, then fold the labels back into the grid's shape.
grid_points = np.array([X1.ravel(), X2.ravel()]).T
grid_labels = svm_classifier.predict(grid_points).reshape(X1.shape)
plt.contourf(X1, X2, grid_labels, alpha=0.75, cmap=class_colours)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
# Overlay the test points, one scatter call per class so each gets a legend
# entry. `color=` is used because the colormap returns a single RGBA tuple,
# which `c=` could misread as a sequence of values to colour-map.
for i, j in enumerate(np.unique(ys)):
    plt.scatter(xs[ys == j, 0], xs[ys == j, 1],
                color=class_colours(i), label=j)
plt.title('Test Set')
plt.xlabel('Trust')
plt.ylabel('GDP')
plt.legend()
plt.show()
```
%% Cell type:markdown id: tags:
### Exercise
Train an SVM using Family and Life Expectancy to predict whether a country is happy or not. Is this data linearly separable?
%% Cell type:code id: tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment