From 40fec9be5a049213090a830243af43526bd2025d Mon Sep 17 00:00:00 2001 From: ashepley <ashepley@myune.edu.au> Date: Fri, 10 Jul 2020 09:31:33 +1000 Subject: [PATCH] Upload New File --- ...nd_to_End_Machine_Learning_Topic__12.ipynb | 887 ++++++++++++++++++ 1 file changed, 887 insertions(+) create mode 100644 topic_12/End_to_End_Machine_Learning_Topic__12.ipynb diff --git a/topic_12/End_to_End_Machine_Learning_Topic__12.ipynb b/topic_12/End_to_End_Machine_Learning_Topic__12.ipynb new file mode 100644 index 0000000..cbf32ca --- /dev/null +++ b/topic_12/End_to_End_Machine_Learning_Topic__12.ipynb @@ -0,0 +1,887 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training and Testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataset 1: World Happiness Report up to 2020\n", + "Our goal is to predict the happiness score of a country using both Single and Multiple Linear Regression Models. You can access the World Happiness Report dataset here: www.kaggle.com/mathurinache/world-happiness-report \n", + "\n", + "#### What is Linear Regression?\n", + "Linear regression is a statistical modelling technique used in supervised machine learning. Linear regression is used for predictive analysis, by displaying the relationship between two variables (x and y). 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import *\n", + "from sklearn.linear_model import LinearRegression\n", + "import numpy as np\n", + "from sklearn.metrics import mean_squared_error\n", + "\n", + "def load_data(DATASET_PATH):\n", + " return pd.read_csv(DATASET_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "DATASET_PATH = './datasets/happiness/2015.csv'\n", + "\n", + "#create pandas object\n", + "happiness = load_data(DATASET_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country</th>\n", + " <th>Region</th>\n", + " <th>Happiness Rank</th>\n", + " <th>Happiness Score</th>\n", + " <th>Standard Error</th>\n", + " <th>Economy (GDP per Capita)</th>\n", + " <th>Family</th>\n", + " <th>Health (Life Expectancy)</th>\n", + " <th>Freedom</th>\n", + " <th>Trust (Government Corruption)</th>\n", + " <th>Generosity</th>\n", + " <th>Dystopia Residual</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Switzerland</td>\n", + " <td>Western Europe</td>\n", + " <td>1</td>\n", + " <td>7.587</td>\n", + " <td>0.03411</td>\n", + " <td>1.39651</td>\n", + " <td>1.34951</td>\n", + " <td>0.94143</td>\n", + " <td>0.66557</td>\n", + " <td>0.41978</td>\n", + " <td>0.29678</td>\n", + " <td>2.51738</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>1</th>\n", + " <td>Iceland</td>\n", + " <td>Western Europe</td>\n", + " <td>2</td>\n", + " <td>7.561</td>\n", + " <td>0.04884</td>\n", + " <td>1.30232</td>\n", + " <td>1.40223</td>\n", + " <td>0.94784</td>\n", + " <td>0.62877</td>\n", + " <td>0.14145</td>\n", + " <td>0.43630</td>\n", + " <td>2.70201</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Denmark</td>\n", + " <td>Western Europe</td>\n", + " <td>3</td>\n", + " <td>7.527</td>\n", + " <td>0.03328</td>\n", + " <td>1.32548</td>\n", + " <td>1.36058</td>\n", + " <td>0.87464</td>\n", + " <td>0.64938</td>\n", + " <td>0.48357</td>\n", + " <td>0.34139</td>\n", + " <td>2.49204</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Norway</td>\n", + " <td>Western Europe</td>\n", + " <td>4</td>\n", + " <td>7.522</td>\n", + " <td>0.03880</td>\n", + " <td>1.45900</td>\n", + " <td>1.33095</td>\n", + " <td>0.88521</td>\n", + " <td>0.66973</td>\n", + " <td>0.36503</td>\n", + " <td>0.34699</td>\n", + " <td>2.46531</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Canada</td>\n", + " <td>North America</td>\n", + " <td>5</td>\n", + " <td>7.427</td>\n", + " <td>0.03553</td>\n", + " <td>1.32629</td>\n", + " <td>1.32261</td>\n", + " <td>0.90563</td>\n", + " <td>0.63297</td>\n", + " <td>0.32957</td>\n", + " <td>0.45811</td>\n", + " <td>2.45176</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Country Region Happiness Rank Happiness Score \\\n", + "0 Switzerland Western Europe 1 7.587 \n", + "1 Iceland Western Europe 2 7.561 \n", + "2 Denmark Western Europe 3 7.527 \n", + "3 Norway Western Europe 4 7.522 \n", + "4 Canada North America 5 7.427 \n", + "\n", + " Standard Error Economy (GDP per Capita) Family \\\n", + "0 0.03411 1.39651 1.34951 \n", + "1 0.04884 1.30232 1.40223 \n", + "2 0.03328 1.32548 1.36058 \n", + "3 0.03880 1.45900 1.33095 \n", + "4 0.03553 1.32629 1.32261 \n", + "\n", + " Health (Life Expectancy) Freedom Trust (Government 
Corruption) \\\n", + "0 0.94143 0.66557 0.41978 \n", + "1 0.94784 0.62877 0.14145 \n", + "2 0.87464 0.64938 0.48357 \n", + "3 0.88521 0.66973 0.36503 \n", + "4 0.90563 0.63297 0.32957 \n", + "\n", + " Generosity Dystopia Residual \n", + "0 0.29678 2.51738 \n", + "1 0.43630 2.70201 \n", + "2 0.34139 2.49204 \n", + "3 0.34699 2.46531 \n", + "4 0.45811 2.45176 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "happiness.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Single Linear Regression\n", + "We'll be building a Single Linear Regression model to display the linear relationship between Economy (GDP per Capita) and a country's Happiness Score. This will hopefully allow us to predict a country's Happiness Score based on its Economy.\n", + "Single Linear Regression is a statistical method that shows the relationships between two continuous variables. The independent or explanatory variable (x), is the predictor, and y is the dependent variable." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Feature Selection" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Happiness Score</th>\n", + " <th>Economy (GDP per Capita)</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>7.587</td>\n", + " <td>1.39651</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>7.561</td>\n", + " <td>1.30232</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>7.527</td>\n", + " <td>1.32548</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>7.522</td>\n", + " <td>1.45900</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>7.427</td>\n", + " <td>1.32629</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Happiness Score Economy (GDP per Capita)\n", + "0 7.587 1.39651\n", + "1 7.561 1.30232\n", + "2 7.527 1.32548\n", + "3 7.522 1.45900\n", + "4 7.427 1.32629" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chosen_columns = ['Happiness Score','Economy (GDP per Capita)']\n", + "economy_happiness = happiness.filter(chosen_columns)\n", + "economy_happiness.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Train/test split" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x 
train/test (126, 1) (32, 1)\n", + "y train/test (126,) (32,)\n" + ] + } + ], + "source": [ + "x_train, x_test, y_train, y_test = train_test_split(economy_happiness.drop(['Happiness Score'], axis=1),economy_happiness['Happiness Score'],test_size=0.2,random_state=42) \n", + "print(\"x train/test \",x_train.shape, x_test.shape)\n", + "print(\"y train/test \",y_train.shape, y_test.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Convert from pandas to np" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>\n" + ] + } + ], + "source": [ + "print(type(x_train),type(y_train))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "x = x_train.values\n", + "y=y_train.values" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'numpy.ndarray'> <class 'numpy.ndarray'>\n" + ] + } + ], + "source": [ + "print(type(x), type(y))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Create and train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n", + " normalize=False)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lin_reg_model = LinearRegression()\n", + "lin_reg_model.fit(x, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If the coefficient of x is positive, then there exists a positive relationship between x (Economy) and y (the Happiness Score). 
If the coefficient of x is negative, then the relationship between x and y is negative, meaning as x increases, y decreases." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lin_reg_model.coef_, lin_reg_model.intercept_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Test and evaluate the model" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "#convert the panda dataframe/series into np arrays\n", + "test_x = x_test.values\n", + "test_y = y_test.values\n", + "\n", + "#pass the test x dataset into the model\n", + "predictions = lin_reg_model.predict(test_x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "* The mean squared error indicates how close a regression line is to a set of points. \n", + "* The distances between the points and the regression line (error between predicted value and actual value) are squared.\n", + "* Squaring removes any negative signs and emphasises large errors.\n", + "* The mean of all the squared errors represents the average of the errors " + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5080153381136627" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#mean squared error\n", + "np.mean((predictions - test_y) ** 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Visualise the predictions v Actual Happiness Scores**" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as 
plt\n", + "\n", + "plt.scatter(lin_reg_model.predict(test_x), test_y)\n", + "plt.title('Predictions v Actual Happiness Scores')\n", + "plt.xlabel('Predictions')\n", + "plt.ylabel('Actual Happiness Scores');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Multiple Linear Regression\n", + "Multiple linear regression is a statistical modelling technique that uses several predictors, or explanatory variables to predict the dependent variable (y). Multiple Linear Regression can provide better prediction reliability, and lower error because the combination of independent variables can lead to a higher probability of a given outcome. For example, if a country has a high GDP per capita (wealthy) it is more likely that people will be healthy, with a strong family unit, meaning they will be happier overall. Therefore we would expect our Multiple Linear Regression model to outperform our Single Linear Regression model, which relies solely on Economy to predict happiness. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Feature Selection**" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Happiness Score</th>\n", + " <th>Family</th>\n", + " <th>Health (Life Expectancy)</th>\n", + " <th>Economy (GDP per Capita)</th>\n", + " <th>Freedom</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>7.587</td>\n", + " <td>1.34951</td>\n", + " <td>0.94143</td>\n", + " <td>1.39651</td>\n", + " <td>0.66557</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>7.561</td>\n", + " <td>1.40223</td>\n", + " <td>0.94784</td>\n", + " <td>1.30232</td>\n", + " <td>0.62877</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>7.527</td>\n", + " <td>1.36058</td>\n", + " <td>0.87464</td>\n", + " <td>1.32548</td>\n", + " <td>0.64938</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>7.522</td>\n", + " <td>1.33095</td>\n", + " <td>0.88521</td>\n", + " <td>1.45900</td>\n", + " <td>0.66973</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>7.427</td>\n", + " <td>1.32261</td>\n", + " <td>0.90563</td>\n", + " <td>1.32629</td>\n", + " <td>0.63297</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Happiness Score Family Health (Life Expectancy) \\\n", + "0 7.587 1.34951 0.94143 \n", + "1 7.561 1.40223 0.94784 \n", + "2 7.527 1.36058 0.87464 \n", + "3 7.522 1.33095 0.88521 \n", + "4 7.427 1.32261 0.90563 \n", + "\n", + 
" Economy (GDP per Capita) Freedom \n", + "0 1.39651 0.66557 \n", + "1 1.30232 0.62877 \n", + "2 1.32548 0.64938 \n", + "3 1.45900 0.66973 \n", + "4 1.32629 0.63297 " + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chosen_features = ['Happiness Score','Family', 'Health (Life Expectancy)', 'Economy (GDP per Capita)', 'Freedom']\n", + "multiple_happiness = happiness.filter(chosen_features)\n", + "multiple_happiness.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Create the train/test splits**" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "x train/test (126, 4) (32, 4)\n", + "y train/test (126,) (32,)\n" + ] + } + ], + "source": [ + "x_train, x_test, y_train, y_test = train_test_split(multiple_happiness.drop(['Happiness Score'], axis=1),multiple_happiness['Happiness Score'],test_size=0.2,random_state=42) \n", + "print(\"x train/test \",x_train.shape, x_test.shape)\n", + "print(\"y train/test \",y_train.shape, y_test.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Prepare the training sets**" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "#convert from pandas dataframe/series to np array for training\n", + "x = x_train.values\n", + "y=y_train.values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Create and train a linear regression model**" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n", + " normalize=False)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lin_reg_model = LinearRegression()\n", + 
"lin_reg_model.fit(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([1.20973375, 1.06133406, 0.91949607, 1.74411857]), 1.960421057319616)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lin_reg_model.coef_,lin_reg_model.intercept_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Let's test it!**" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "#convert the panda dataframe/series into np arrays\n", + "test_x = x_test.values\n", + "test_y = y_test.values\n", + "\n", + "#pass the test x dataset into the model\n", + "predictions = lin_reg_model.predict(test_x)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Let's Evaluate it!**" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.2320463982294615" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#mean squared error\n", + "np.mean((predictions - test_y) ** 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.70905 0.48246 0.27108 0.44017] 4.3069999999999995 4.347199667982519\n", + "[1.25712 0.99111 1.27074 0.49615] 5.987 6.566885214446166\n", + "[0.747 0.61712 0.8818 0.17288] 4.194 4.6313974975723315\n", + "[0.41587 0.22396 0.0153 0.1185 ] 2.905 2.921955747845097\n", + "[0.95152 0.43873 0.18847 0.46582] 5.0569999999999995 4.56288874190275\n", + "[1.24823 0.78723 1.05351 0.44974] 6.574 6.0590592182554275\n", + "[1.04103 0.07612 0.37545 0.31767] 4.898 4.199857874493746\n", + "[0.73803 0.54909 0.59066 0.59591] 4.876 5.018456020765323\n", + "[0.91916 0.79081 0.83223 0.09245] 4.949 
4.838149496430775\n", + "[1.13299 0.33861 0.21102 0.45727] 3.931 4.681980781245059\n", + "[1.12575 0.80925 1.42727 0.64157] 6.901 6.612506721311087\n", + "[1.14184 0.74314 0.59325 0.55475] 5.827999999999999 5.643504054954601\n", + "[1.23287 0.69702 0.98124 0.49049] 6.983 5.94935561254865\n", + "[1.22668 0.53886 0.95847 0.4761 ] 5.547999999999999 5.72797197341618\n", + "[1.1985 0.79661 1.06353 0.5421 ] 6.7860000000000005 6.179154614012047\n", + "[1.20643 0.84483 1.17898 0.46364] 6.505 6.209237609834251\n", + "[0.95571 0. 0.33024 0.4084 ] 4.507 4.132528102453181\n", + "[0.54447 0.69805 1.0088 0.30033] 4.686 4.8113477999285355\n", + "[1.02626 0.09131 0.08308 0.34037] 4.971 3.968870196574679\n", + "[1.07008 0.92356 1.20806 0.49027] 5.695 6.201034067408417\n", + "[0.77115 0.15185 0.46534 0.46866] 3.655 4.299747723696424\n", + "[1.28566 0.89667 1.30782 0.5845 ] 6.937 6.689366416355663\n", + "[1.29704 0.89042 1.33723 0.62433] 7.2 6.793010470490197\n", + "[1.30923 0.93156 1.33358 0.65124] 7.284 6.894998478232223\n", + "[1.2089 0.8116 1.06166 0.60362] 6.485 6.313223959197274\n", + "[1.05392 0.69639 0.90198 0.40661] 5.192 5.513029193967952\n", + "[1.01528 0.61826 0.59448 0.32818] 5.888999999999999 4.963826790421297\n", + "[0.79273 0.36315 0.23906 0.22917] 3.9560000000000004 3.9243511389537744\n", + "[0.59207 0.36291 0.44025 0.46074] 4.369 4.270230196212314\n", + "[0.66801 0.46721 0.20824 0.19184] 3.681 3.7904687536157557\n", + "[1.00268 0.38215 0.2852 0.32878] 3.781 4.4146572856324005\n", + "[0.94632 0.73172 1.06098 0.22815] 5.332000000000001 5.255303251064371\n" + ] + } + ], + "source": [ + "#see the results\n", + "for each, actual_happiness, preds in zip(test_x, test_y, predictions):\n", + " print(each, actual_happiness, preds)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + 
"output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "plt.scatter(lin_reg_model.predict(test_x), test_y)\n", + "plt.title('Predictions v Actual Happiness Scores')\n", + "plt.xlabel('Predictions')\n", + "plt.ylabel('Actual Happiness Scores');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise\n", + "* Create Simple and Multiple Linear Regression Models trained on stratified training sets\n", + "* Evaluate the models and compare the results to those presented in this notebook\n", + "* Are the results better, worse or the same? Why?" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- GitLab