def load_data(DATASET_PATH):
    """Load a CSV dataset into a pandas DataFrame.

    Parameters
    ----------
    DATASET_PATH : str, path-like or file-like
        Location of the CSV file; passed straight through to
        ``pd.read_csv`` with all of its default parsing options.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    dataset = pd.read_csv(DATASET_PATH)
    return dataset
+ "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 5416 entries, 0 to 5415\n", + "Data columns (total 13 columns):\n", + "name 5416 non-null object\n", + "date 5416 non-null object\n", + "manner_of_death 5416 non-null object\n", + "armed 5189 non-null object\n", + "age 5181 non-null float64\n", + "gender 5414 non-null object\n", + "race 4895 non-null object\n", + "city 5416 non-null object\n", + "state 5416 non-null object\n", + "signs_of_mental_illness 5416 non-null bool\n", + "threat_level 5416 non-null object\n", + "flee 5167 non-null object\n", + "body_camera 5416 non-null bool\n", + "dtypes: bool(2), float64(1), object(10)\n", + "memory usage: 476.1+ KB\n" + ] + } + ], + "source": [ + "police.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Choosing a Subset - Feature Selection\n", + "* Datasets often contain a large range of data types\n", + "* Some of these data types may not be relevant or useful\n", + "* Performing feature selection after Exploratory Data Analysis enables you to focus training on data that is relevant to what you want to predict\n", + "* Creating a subset of the dataset can be a good way to reduce computational cost, error, etc\n", + "* Creating a subset can make it easier to clean data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "police_simple = police.filter(['armed','age','race','state','signs_of_mental_illness','flee'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: 
There are a few things to look for when cleaning data, including:
* Missing values - this could mean data was unavailable, deliberately excluded, or simply forgotten
* 'Placeholder' values used in place of missing values, e.g. 999 for unknown age or 12:00am for unknown date
* Data entered by humans - such data is often inconsistent and subject to errors such as misspelling
* This link will help you identify and resolve many bad data issues: https://github.com/Quartz/bad-data-guide

**Firstly, find NaN entries (Missing values)**
<tr>\n", + " <th>59</th>\n", + " <td>gun</td>\n", + " <td>59.0</td>\n", + " <td>NaN</td>\n", + " <td>NJ</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>60</th>\n", + " <td>NaN</td>\n", + " <td>17.0</td>\n", + " <td>H</td>\n", + " <td>CO</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>65</th>\n", + " <td>NaN</td>\n", + " <td>26.0</td>\n", + " <td>N</td>\n", + " <td>AZ</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>105</th>\n", + " <td>NaN</td>\n", + " <td>23.0</td>\n", + " <td>W</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>122</th>\n", + " <td>NaN</td>\n", + " <td>50.0</td>\n", + " <td>H</td>\n", + " <td>TX</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>127</th>\n", + " <td>gun</td>\n", + " <td>NaN</td>\n", + " <td>H</td>\n", + " <td>TX</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>138</th>\n", + " <td>NaN</td>\n", + " <td>27.0</td>\n", + " <td>B</td>\n", + " <td>OK</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>166</th>\n", + " <td>NaN</td>\n", + " <td>48.0</td>\n", + " <td>B</td>\n", + " <td>FL</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>170</th>\n", + " <td>NaN</td>\n", + " <td>35.0</td>\n", + " <td>W</td>\n", + " <td>DE</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>179</th>\n", + " <td>NaN</td>\n", + " <td>37.0</td>\n", + " <td>B</td>\n", + " <td>MD</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>185</th>\n", + " <td>NaN</td>\n", + " <td>25.0</td>\n", + " <td>H</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>199</th>\n", + 
" <td>NaN</td>\n", + " <td>24.0</td>\n", + " <td>W</td>\n", + " <td>CA</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>239</th>\n", + " <td>NaN</td>\n", + " <td>26.0</td>\n", + " <td>W</td>\n", + " <td>OH</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>240</th>\n", + " <td>gun</td>\n", + " <td>54.0</td>\n", + " <td>NaN</td>\n", + " <td>NV</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>244</th>\n", + " <td>NaN</td>\n", + " <td>27.0</td>\n", + " <td>B</td>\n", + " <td>MD</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>263</th>\n", + " <td>NaN</td>\n", + " <td>42.0</td>\n", + " <td>B</td>\n", + " <td>GA</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>270</th>\n", + " <td>NaN</td>\n", + " <td>54.0</td>\n", + " <td>NaN</td>\n", + " <td>OK</td>\n", + " <td>False</td>\n", + " <td>Other</td>\n", + " </tr>\n", + " <tr>\n", + " <th>298</th>\n", + " <td>NaN</td>\n", + " <td>35.0</td>\n", + " <td>B</td>\n", + " <td>NJ</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>307</th>\n", + " <td>NaN</td>\n", + " <td>46.0</td>\n", + " <td>W</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>340</th>\n", + " <td>knife</td>\n", + " <td>72.0</td>\n", + " <td>NaN</td>\n", + " <td>GA</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>342</th>\n", + " <td>NaN</td>\n", + " <td>21.0</td>\n", + " <td>B</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>346</th>\n", + " <td>NaN</td>\n", + " <td>31.0</td>\n", + " <td>B</td>\n", + " <td>MN</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>348</th>\n", + " <td>NaN</td>\n", + " <td>34.0</td>\n", + " 
<td>B</td>\n", + " <td>MD</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>349</th>\n", + " <td>NaN</td>\n", + " <td>30.0</td>\n", + " <td>B</td>\n", + " <td>TX</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>398</th>\n", + " <td>gun</td>\n", + " <td>40.0</td>\n", + " <td>NaN</td>\n", + " <td>OR</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5348</th>\n", + " <td>gun</td>\n", + " <td>44.0</td>\n", + " <td>NaN</td>\n", + " <td>TN</td>\n", + " <td>True</td>\n", + " <td>Foot</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5349</th>\n", + " <td>gun</td>\n", + " <td>52.0</td>\n", + " <td>NaN</td>\n", + " <td>MT</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5350</th>\n", + " <td>knife</td>\n", + " <td>43.0</td>\n", + " <td>NaN</td>\n", + " <td>TN</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5353</th>\n", + " <td>toy weapon</td>\n", + " <td>35.0</td>\n", + " <td>NaN</td>\n", + " <td>CA</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5354</th>\n", + " <td>knife</td>\n", + " <td>33.0</td>\n", + " <td>NaN</td>\n", + " <td>TX</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5355</th>\n", + " <td>toy weapon</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>CA</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5357</th>\n", + " <td>toy weapon</td>\n", + " <td>61.0</td>\n", + " <td>NaN</td>\n", + " <td>NY</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5360</th>\n", + " <td>gun</td>\n", + " 
<td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>CO</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5362</th>\n", + " <td>gun</td>\n", + " <td>47.0</td>\n", + " <td>NaN</td>\n", + " <td>OH</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5363</th>\n", + " <td>undetermined</td>\n", + " <td>61.0</td>\n", + " <td>NaN</td>\n", + " <td>FL</td>\n", + " <td>False</td>\n", + " <td>Foot</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5366</th>\n", + " <td>sword</td>\n", + " <td>50.0</td>\n", + " <td>H</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5368</th>\n", + " <td>knife</td>\n", + " <td>30.0</td>\n", + " <td>NaN</td>\n", + " <td>OK</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5369</th>\n", + " <td>undetermined</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>AL</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5373</th>\n", + " <td>crowbar</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5375</th>\n", + " <td>knife</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>OR</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5377</th>\n", + " <td>toy weapon</td>\n", + " <td>NaN</td>\n", + " <td>W</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5378</th>\n", + " <td>gun</td>\n", + " <td>32.0</td>\n", + " <td>A</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5379</th>\n", + " <td>knife</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>PA</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5383</th>\n", + " 
<td>undetermined</td>\n", + " <td>49.0</td>\n", + " <td>W</td>\n", + " <td>OR</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5385</th>\n", + " <td>gun</td>\n", + " <td>53.0</td>\n", + " <td>B</td>\n", + " <td>KY</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5388</th>\n", + " <td>gun</td>\n", + " <td>65.0</td>\n", + " <td>NaN</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5398</th>\n", + " <td>metal pipe</td>\n", + " <td>37.0</td>\n", + " <td>NaN</td>\n", + " <td>TN</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5399</th>\n", + " <td>pellet gun</td>\n", + " <td>26.0</td>\n", + " <td>NaN</td>\n", + " <td>NY</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5401</th>\n", + " <td>gun</td>\n", + " <td>81.0</td>\n", + " <td>NaN</td>\n", + " <td>NM</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5402</th>\n", + " <td>gun</td>\n", + " <td>31.0</td>\n", + " <td>NaN</td>\n", + " <td>CO</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5403</th>\n", + " <td>gun</td>\n", + " <td>38.0</td>\n", + " <td>B</td>\n", + " <td>FL</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5404</th>\n", + " <td>gun</td>\n", + " <td>59.0</td>\n", + " <td>NaN</td>\n", + " <td>ID</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5412</th>\n", + " <td>undetermined</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5414</th>\n", + " <td>gun</td>\n", + " <td>24.0</td>\n", + " <td>NaN</td>\n", + " <td>IL</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>5415</th>\n", + " <td>gun</td>\n", + " <td>27.0</td>\n", + " <td>NaN</td>\n", + " <td>AZ</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>1016 rows × 6 columns</p>\n", + "</div>" + ], + "text/plain": [ + " armed age race state signs_of_mental_illness flee\n", + "15 NaN 28.0 W MT False Not fleeing\n", + "26 NaN 24.0 B MN False Not fleeing\n", + "27 NaN 29.0 W MO False Not fleeing\n", + "45 NaN 42.0 B AZ False Car\n", + "52 NaN 29.0 B FL False Not fleeing\n", + "59 gun 59.0 NaN NJ False Not fleeing\n", + "60 NaN 17.0 H CO False Not fleeing\n", + "65 NaN 26.0 N AZ False Not fleeing\n", + "105 NaN 23.0 W CA False Car\n", + "122 NaN 50.0 H TX False Not fleeing\n", + "127 gun NaN H TX False Car\n", + "138 NaN 27.0 B OK False Car\n", + "166 NaN 48.0 B FL False Not fleeing\n", + "170 NaN 35.0 W DE False Car\n", + "179 NaN 37.0 B MD False Not fleeing\n", + "185 NaN 25.0 H CA False Not fleeing\n", + "199 NaN 24.0 W CA True Not fleeing\n", + "239 NaN 26.0 W OH False Car\n", + "240 gun 54.0 NaN NV False Not fleeing\n", + "244 NaN 27.0 B MD False Car\n", + "263 NaN 42.0 B GA False Car\n", + "270 NaN 54.0 NaN OK False Other\n", + "298 NaN 35.0 B NJ False Not fleeing\n", + "307 NaN 46.0 W CA False Car\n", + "340 knife 72.0 NaN GA True Not fleeing\n", + "342 NaN 21.0 B CA False Car\n", + "346 NaN 31.0 B MN False Car\n", + "348 NaN 34.0 B MD False Car\n", + "349 NaN 30.0 B TX False Car\n", + "398 gun 40.0 NaN OR True Not fleeing\n", + "... ... ... ... ... ... 
def check_NaN(dataframe):
    """Print a summary of the missing values in a DataFrame.

    Two lines of information are written to stdout: the total number of
    null cells in the whole frame, followed by the null count of each
    column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to inspect. It is not modified.

    Returns
    -------
    None
    """
    # Boolean frame: True wherever a cell is null.
    missing_mask = dataframe.isnull()

    # Sum over the raw array for the overall total, and column-wise for
    # the per-column breakdown.
    total_missing = missing_mask.values.sum()
    missing_per_column = missing_mask.sum()

    print("Total NaN:", total_missing)
    print("NaN by column:\n", missing_per_column)
def fillNaN_unreported(dataframe, key):
    """Replace missing values in one column with the label "unreported".

    The column is filled and then assigned back onto ``dataframe``, so the
    caller's frame is updated. The previous implementation called
    ``dataframe[key].fillna(..., inplace=True)``, which mutates the
    intermediate Series returned by ``dataframe[key]``; pandas warns about
    that pattern and, with Copy-on-Write enabled, it leaves ``dataframe``
    unchanged.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to update. Modified in place (the column ``key`` is replaced).
    key : hashable
        Label of the column whose null entries should be filled.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``key`` is not a column of ``dataframe``.
    """
    # Assign the filled column back instead of relying on chained in-place
    # mutation, which is not guaranteed to reach the parent frame.
    dataframe[key] = dataframe[key].fillna("unreported")
    return
# --- Cell: impact of the filled-in values on the race distribution ---
# Number of rows per race category; after the earlier fill step this
# includes the "unreported" bucket that replaced the NaN entries.
race_count = police_simple['race'].value_counts()
sns.set(style="darkgrid")
# Pass the data vectors by keyword: seaborn >= 0.12 no longer accepts
# x and y positionally and raises a TypeError if they are.
sns.barplot(x=race_count.index, y=race_count.values)
plt.title('Frequency Distribution of Races')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Race', fontsize=12)
plt.show()

# --- Cell: look for 'placeholder' values in the age column ---
# Print every age outside the range 0-100; no output means no impossible
# ages were found.
ages = police_simple['age']
impossible_ages = ages[(ages > 100) | (ages < 0)]
for each_age in impossible_ages:
    print(each_age)
gun', 'tire iron',\n", + " 'air conditioner', 'pole and knife', 'baseball bat and bottle',\n", + " 'fireworks', 'pen', 'chainsaw', 'gun and sword', 'gun and car',\n", + " 'pellet gun', 'claimed to be armed', 'BB gun', 'incendiary device',\n", + " 'samurai sword', 'bow and arrow', 'gun and vehicle',\n", + " 'vehicle and gun', 'wrench', 'walking stick', 'barstool',\n", + " 'grenade', 'BB gun and vehicle', 'wasp spray', 'air pistol',\n", + " 'Airsoft pistol', 'baseball bat and knife', 'vehicle and machete',\n", + " 'ice pick', 'car, knife and mace'], dtype=object)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#check for spelling errors\n", + "police_simple['armed'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "police_simple.loc[police_simple['armed'] == 'undetermined', 'armed'] = \"unreported\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['gun', 'unarmed', 'toy weapon', 'nail gun', 'knife', 'unreported',\n", + " 'shovel', 'hammer', 'hatchet', 'sword', 'machete', 'box cutter',\n", + " 'metal object', 'screwdriver', 'lawn mower blade', 'flagpole',\n", + " 'guns and explosives', 'cordless drill', 'crossbow', 'metal pole',\n", + " 'Taser', 'metal pipe', 'metal hand tool', 'blunt object',\n", + " 'metal stick', 'sharp object', 'meat cleaver', 'carjack', 'chain',\n", + " \"contractor's level\", 'unknown weapon', 'stapler', 'beer bottle',\n", + " 'bean-bag gun', 'baseball bat and fireplace poker',\n", + " 'straight edge razor', 'gun and knife', 'ax', 'brick',\n", + " 'baseball bat', 'hand torch', 'chain saw', 'garden tool',\n", + " 'scissors', 'pole', 'pick-axe', 'flashlight', 'vehicle', 'baton',\n", + " 'spear', 'chair', 'pitchfork', 'hatchet and gun', 'rock',\n", + " 'piece of wood', 'bayonet', 'pipe', 'glass shard', 'motorcycle',\n", + " 
'pepper spray', 'metal rake', 'crowbar', 'oar', 'machete and gun',\n", + " 'tire iron', 'air conditioner', 'pole and knife',\n", + " 'baseball bat and bottle', 'fireworks', 'pen', 'chainsaw',\n", + " 'gun and sword', 'gun and car', 'pellet gun',\n", + " 'claimed to be armed', 'BB gun', 'incendiary device',\n", + " 'samurai sword', 'bow and arrow', 'gun and vehicle',\n", + " 'vehicle and gun', 'wrench', 'walking stick', 'barstool',\n", + " 'grenade', 'BB gun and vehicle', 'wasp spray', 'air pistol',\n", + " 'Airsoft pistol', 'baseball bat and knife', 'vehicle and machete',\n", + " 'ice pick', 'car, knife and mace'], dtype=object)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "police_simple['armed'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "police_simple.loc[police_simple['armed'] == 'unknown weapon', 'armed'] = \"unreported\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['gun', 'unarmed', 'toy weapon', 'nail gun', 'knife', 'unreported',\n", + " 'shovel', 'hammer', 'hatchet', 'sword', 'machete', 'box cutter',\n", + " 'metal object', 'screwdriver', 'lawn mower blade', 'flagpole',\n", + " 'guns and explosives', 'cordless drill', 'crossbow', 'metal pole',\n", + " 'Taser', 'metal pipe', 'metal hand tool', 'blunt object',\n", + " 'metal stick', 'sharp object', 'meat cleaver', 'carjack', 'chain',\n", + " \"contractor's level\", 'stapler', 'beer bottle', 'bean-bag gun',\n", + " 'baseball bat and fireplace poker', 'straight edge razor',\n", + " 'gun and knife', 'ax', 'brick', 'baseball bat', 'hand torch',\n", + " 'chain saw', 'garden tool', 'scissors', 'pole', 'pick-axe',\n", + " 'flashlight', 'vehicle', 'baton', 'spear', 'chair', 'pitchfork',\n", + " 'hatchet and gun', 'rock', 'piece of wood', 'bayonet', 'pipe',\n", + " 'glass shard', 
'motorcycle', 'pepper spray', 'metal rake',\n", + " 'crowbar', 'oar', 'machete and gun', 'tire iron',\n", + " 'air conditioner', 'pole and knife', 'baseball bat and bottle',\n", + " 'fireworks', 'pen', 'chainsaw', 'gun and sword', 'gun and car',\n", + " 'pellet gun', 'claimed to be armed', 'BB gun', 'incendiary device',\n", + " 'samurai sword', 'bow and arrow', 'gun and vehicle',\n", + " 'vehicle and gun', 'wrench', 'walking stick', 'barstool',\n", + " 'grenade', 'BB gun and vehicle', 'wasp spray', 'air pistol',\n", + " 'Airsoft pistol', 'baseball bat and knife', 'vehicle and machete',\n", + " 'ice pick', 'car, knife and mace'], dtype=object)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "police_simple['armed'].unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataset 2: World Happiness Report up to 2020\n", + "You can access this dataset here: www.kaggle.com/mathurinache/world-happiness-report \n", + "> This is an example of a dataset that is:\n", + "* High quality\n", + "* Numerical" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "DATASET_PATH = './datasets/happiness/2015.csv'\n", + "\n", + "#create pandas object\n", + "happiness = load_data(DATASET_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total NaN: 0\n", + "NaN by column:\n", + " Country 0\n", + "Region 0\n", + "Happiness Rank 0\n", + "Happiness Score 0\n", + "Standard Error 0\n", + "Economy (GDP per Capita) 0\n", + "Family 0\n", + "Health (Life Expectancy) 0\n", + "Freedom 0\n", + "Trust (Government Corruption) 0\n", + "Generosity 0\n", + "Dystopia Residual 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "check_NaN(happiness)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + 
"outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country</th>\n", + " <th>Region</th>\n", + " <th>Happiness Rank</th>\n", + " <th>Happiness Score</th>\n", + " <th>Standard Error</th>\n", + " <th>Economy (GDP per Capita)</th>\n", + " <th>Family</th>\n", + " <th>Health (Life Expectancy)</th>\n", + " <th>Freedom</th>\n", + " <th>Trust (Government Corruption)</th>\n", + " <th>Generosity</th>\n", + " <th>Dystopia Residual</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Switzerland</td>\n", + " <td>Western Europe</td>\n", + " <td>1</td>\n", + " <td>7.587</td>\n", + " <td>0.03411</td>\n", + " <td>1.39651</td>\n", + " <td>1.34951</td>\n", + " <td>0.94143</td>\n", + " <td>0.66557</td>\n", + " <td>0.41978</td>\n", + " <td>0.29678</td>\n", + " <td>2.51738</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Iceland</td>\n", + " <td>Western Europe</td>\n", + " <td>2</td>\n", + " <td>7.561</td>\n", + " <td>0.04884</td>\n", + " <td>1.30232</td>\n", + " <td>1.40223</td>\n", + " <td>0.94784</td>\n", + " <td>0.62877</td>\n", + " <td>0.14145</td>\n", + " <td>0.43630</td>\n", + " <td>2.70201</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Denmark</td>\n", + " <td>Western Europe</td>\n", + " <td>3</td>\n", + " <td>7.527</td>\n", + " <td>0.03328</td>\n", + " <td>1.32548</td>\n", + " <td>1.36058</td>\n", + " <td>0.87464</td>\n", + " <td>0.64938</td>\n", + " <td>0.48357</td>\n", + " <td>0.34139</td>\n", + " <td>2.49204</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Norway</td>\n", + " 
<td>Western Europe</td>\n", + " <td>4</td>\n", + " <td>7.522</td>\n", + " <td>0.03880</td>\n", + " <td>1.45900</td>\n", + " <td>1.33095</td>\n", + " <td>0.88521</td>\n", + " <td>0.66973</td>\n", + " <td>0.36503</td>\n", + " <td>0.34699</td>\n", + " <td>2.46531</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Canada</td>\n", + " <td>North America</td>\n", + " <td>5</td>\n", + " <td>7.427</td>\n", + " <td>0.03553</td>\n", + " <td>1.32629</td>\n", + " <td>1.32261</td>\n", + " <td>0.90563</td>\n", + " <td>0.63297</td>\n", + " <td>0.32957</td>\n", + " <td>0.45811</td>\n", + " <td>2.45176</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Country Region Happiness Rank Happiness Score \\\n", + "0 Switzerland Western Europe 1 7.587 \n", + "1 Iceland Western Europe 2 7.561 \n", + "2 Denmark Western Europe 3 7.527 \n", + "3 Norway Western Europe 4 7.522 \n", + "4 Canada North America 5 7.427 \n", + "\n", + " Standard Error Economy (GDP per Capita) Family \\\n", + "0 0.03411 1.39651 1.34951 \n", + "1 0.04884 1.30232 1.40223 \n", + "2 0.03328 1.32548 1.36058 \n", + "3 0.03880 1.45900 1.33095 \n", + "4 0.03553 1.32629 1.32261 \n", + "\n", + " Health (Life Expectancy) Freedom Trust (Government Corruption) \\\n", + "0 0.94143 0.66557 0.41978 \n", + "1 0.94784 0.62877 0.14145 \n", + "2 0.87464 0.64938 0.48357 \n", + "3 0.88521 0.66973 0.36503 \n", + "4 0.90563 0.63297 0.32957 \n", + "\n", + " Generosity Dystopia Residual \n", + "0 0.29678 2.51738 \n", + "1 0.43630 2.70201 \n", + "2 0.34139 2.49204 \n", + "3 0.34699 2.46531 \n", + "4 0.45811 2.45176 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "happiness.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}