diff --git a/topic_11/End_to_End_Machine_Learning_Topic__11.ipynb b/topic_11/End_to_End_Machine_Learning_Topic__11.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c2cb59970c9cb9a3010c91e1fed301555da5ccee --- /dev/null +++ b/topic_11/End_to_End_Machine_Learning_Topic__11.ipynb @@ -0,0 +1,1429 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Cleaning\n", + "* Datasets may come in a large range of formats\n", + "* Datasets vary in usefulness and quality\n", + "* Data is not always recorded consistently and accurately\n", + "* It is very important to check our datasets for errors, empty or undefined cells, etc.\n", + "\n", + "* Data cleaning is the process of searching through a dataset to address any problems and inconsistencies, including missing values\n", + "* Data cleaning cannot address issues such as bias within a dataset\n", + "* Data cleaning resolves issues with data entry and format to prevent errors in processing or incorrect conclusions" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def load_data(DATASET_PATH):\n", + " return pd.read_csv(DATASET_PATH)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataset 1: Police Shootings\n", + "You can access this dataset here: www.kaggle.com/washingtonpost/police-shootings\n", + "> This is an example of a dataset that is:\n", + "* Moderate quality\n", + "* Categorical" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "DATASET_PATH = './datasets/police_shooting/fatal-police-shootings-data.csv'\n", + "\n", + "police = load_data(DATASET_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ 
+ "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 5416 entries, 0 to 5415\n", + "Data columns (total 13 columns):\n", + "name 5416 non-null object\n", + "date 5416 non-null object\n", + "manner_of_death 5416 non-null object\n", + "armed 5189 non-null object\n", + "age 5181 non-null float64\n", + "gender 5414 non-null object\n", + "race 4895 non-null object\n", + "city 5416 non-null object\n", + "state 5416 non-null object\n", + "signs_of_mental_illness 5416 non-null bool\n", + "threat_level 5416 non-null object\n", + "flee 5167 non-null object\n", + "body_camera 5416 non-null bool\n", + "dtypes: bool(2), float64(1), object(10)\n", + "memory usage: 476.1+ KB\n" + ] + } + ], + "source": [ + "police.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Choosing a Subset - Feature Selection\n", + "* Datasets often contain a large range of data types\n", + "* Some of these data types may not be relevant or useful\n", + "* Performing feature selection after Exploratory Data Analysis enables you to focus training on data that is relevant to what you want to predict\n", + "* Creating a subset of the dataset can be a good way to reduce computational cost, error, etc\n", + "* Creating a subset can make it easier to clean data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "police_simple = police.filter(['armed','age','race','state','signs_of_mental_illness','flee'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: 
right;\">\n", + " <th></th>\n", + " <th>armed</th>\n", + " <th>age</th>\n", + " <th>race</th>\n", + " <th>state</th>\n", + " <th>signs_of_mental_illness</th>\n", + " <th>flee</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>gun</td>\n", + " <td>53.0</td>\n", + " <td>A</td>\n", + " <td>WA</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>gun</td>\n", + " <td>47.0</td>\n", + " <td>W</td>\n", + " <td>OR</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>unarmed</td>\n", + " <td>23.0</td>\n", + " <td>H</td>\n", + " <td>KS</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>toy weapon</td>\n", + " <td>32.0</td>\n", + " <td>W</td>\n", + " <td>CA</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>nail gun</td>\n", + " <td>39.0</td>\n", + " <td>H</td>\n", + " <td>CO</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " armed age race state signs_of_mental_illness flee\n", + "0 gun 53.0 A WA True Not fleeing\n", + "1 gun 47.0 W OR False Not fleeing\n", + "2 unarmed 23.0 H KS False Not fleeing\n", + "3 toy weapon 32.0 W CA True Not fleeing\n", + "4 nail gun 39.0 H CO False Not fleeing" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "police_simple.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are a few things to look for when cleaning data including: \n", + "* Missing values - this could mean data was unavailable, deliberately excluded, or simply forgotten \n", + "* 'Placeholder' values used in place of missing values, e.g. 
999 for unknown age or 12:00am for unknown date\n", + "* Data entered by humans - they are often inconsistent and subject to error such as misspelling \n", + "* This link will help you identify and resolve many bad data issues: https://github.com/Quartz/bad-data-guide\n", + "\n", + "**Firstly, find NaN entries (Missing values)**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>armed</th>\n", + " <th>age</th>\n", + " <th>race</th>\n", + " <th>state</th>\n", + " <th>signs_of_mental_illness</th>\n", + " <th>flee</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>15</th>\n", + " <td>NaN</td>\n", + " <td>28.0</td>\n", + " <td>W</td>\n", + " <td>MT</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>26</th>\n", + " <td>NaN</td>\n", + " <td>24.0</td>\n", + " <td>B</td>\n", + " <td>MN</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>27</th>\n", + " <td>NaN</td>\n", + " <td>29.0</td>\n", + " <td>W</td>\n", + " <td>MO</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>45</th>\n", + " <td>NaN</td>\n", + " <td>42.0</td>\n", + " <td>B</td>\n", + " <td>AZ</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>52</th>\n", + " <td>NaN</td>\n", + " <td>29.0</td>\n", + " <td>B</td>\n", + " <td>FL</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>59</th>\n", + " <td>gun</td>\n", + " <td>59.0</td>\n", + " <td>NaN</td>\n", + " <td>NJ</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>60</th>\n", + " <td>NaN</td>\n", + " <td>17.0</td>\n", + " <td>H</td>\n", + " <td>CO</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>65</th>\n", + " <td>NaN</td>\n", + " <td>26.0</td>\n", + " <td>N</td>\n", + " <td>AZ</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>105</th>\n", + " <td>NaN</td>\n", + " <td>23.0</td>\n", + " <td>W</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>122</th>\n", + " <td>NaN</td>\n", + " <td>50.0</td>\n", + " <td>H</td>\n", + " <td>TX</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>127</th>\n", + " <td>gun</td>\n", + " <td>NaN</td>\n", + " <td>H</td>\n", + " <td>TX</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>138</th>\n", + " <td>NaN</td>\n", + " <td>27.0</td>\n", + " <td>B</td>\n", + " <td>OK</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>166</th>\n", + " <td>NaN</td>\n", + " <td>48.0</td>\n", + " <td>B</td>\n", + " <td>FL</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>170</th>\n", + " <td>NaN</td>\n", + " <td>35.0</td>\n", + " <td>W</td>\n", + " <td>DE</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>179</th>\n", + " <td>NaN</td>\n", + " <td>37.0</td>\n", + " <td>B</td>\n", + " <td>MD</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>185</th>\n", + " <td>NaN</td>\n", + " <td>25.0</td>\n", + " <td>H</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>199</th>\n", + 
" <td>NaN</td>\n", + " <td>24.0</td>\n", + " <td>W</td>\n", + " <td>CA</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>239</th>\n", + " <td>NaN</td>\n", + " <td>26.0</td>\n", + " <td>W</td>\n", + " <td>OH</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>240</th>\n", + " <td>gun</td>\n", + " <td>54.0</td>\n", + " <td>NaN</td>\n", + " <td>NV</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>244</th>\n", + " <td>NaN</td>\n", + " <td>27.0</td>\n", + " <td>B</td>\n", + " <td>MD</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>263</th>\n", + " <td>NaN</td>\n", + " <td>42.0</td>\n", + " <td>B</td>\n", + " <td>GA</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>270</th>\n", + " <td>NaN</td>\n", + " <td>54.0</td>\n", + " <td>NaN</td>\n", + " <td>OK</td>\n", + " <td>False</td>\n", + " <td>Other</td>\n", + " </tr>\n", + " <tr>\n", + " <th>298</th>\n", + " <td>NaN</td>\n", + " <td>35.0</td>\n", + " <td>B</td>\n", + " <td>NJ</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>307</th>\n", + " <td>NaN</td>\n", + " <td>46.0</td>\n", + " <td>W</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>340</th>\n", + " <td>knife</td>\n", + " <td>72.0</td>\n", + " <td>NaN</td>\n", + " <td>GA</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>342</th>\n", + " <td>NaN</td>\n", + " <td>21.0</td>\n", + " <td>B</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>346</th>\n", + " <td>NaN</td>\n", + " <td>31.0</td>\n", + " <td>B</td>\n", + " <td>MN</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>348</th>\n", + " <td>NaN</td>\n", + " <td>34.0</td>\n", + " 
<td>B</td>\n", + " <td>MD</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>349</th>\n", + " <td>NaN</td>\n", + " <td>30.0</td>\n", + " <td>B</td>\n", + " <td>TX</td>\n", + " <td>False</td>\n", + " <td>Car</td>\n", + " </tr>\n", + " <tr>\n", + " <th>398</th>\n", + " <td>gun</td>\n", + " <td>40.0</td>\n", + " <td>NaN</td>\n", + " <td>OR</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5348</th>\n", + " <td>gun</td>\n", + " <td>44.0</td>\n", + " <td>NaN</td>\n", + " <td>TN</td>\n", + " <td>True</td>\n", + " <td>Foot</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5349</th>\n", + " <td>gun</td>\n", + " <td>52.0</td>\n", + " <td>NaN</td>\n", + " <td>MT</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5350</th>\n", + " <td>knife</td>\n", + " <td>43.0</td>\n", + " <td>NaN</td>\n", + " <td>TN</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5353</th>\n", + " <td>toy weapon</td>\n", + " <td>35.0</td>\n", + " <td>NaN</td>\n", + " <td>CA</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5354</th>\n", + " <td>knife</td>\n", + " <td>33.0</td>\n", + " <td>NaN</td>\n", + " <td>TX</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5355</th>\n", + " <td>toy weapon</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>CA</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5357</th>\n", + " <td>toy weapon</td>\n", + " <td>61.0</td>\n", + " <td>NaN</td>\n", + " <td>NY</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5360</th>\n", + " <td>gun</td>\n", + " 
<td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>CO</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5362</th>\n", + " <td>gun</td>\n", + " <td>47.0</td>\n", + " <td>NaN</td>\n", + " <td>OH</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5363</th>\n", + " <td>undetermined</td>\n", + " <td>61.0</td>\n", + " <td>NaN</td>\n", + " <td>FL</td>\n", + " <td>False</td>\n", + " <td>Foot</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5366</th>\n", + " <td>sword</td>\n", + " <td>50.0</td>\n", + " <td>H</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5368</th>\n", + " <td>knife</td>\n", + " <td>30.0</td>\n", + " <td>NaN</td>\n", + " <td>OK</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5369</th>\n", + " <td>undetermined</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>AL</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5373</th>\n", + " <td>crowbar</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5375</th>\n", + " <td>knife</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>OR</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5377</th>\n", + " <td>toy weapon</td>\n", + " <td>NaN</td>\n", + " <td>W</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5378</th>\n", + " <td>gun</td>\n", + " <td>32.0</td>\n", + " <td>A</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5379</th>\n", + " <td>knife</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>PA</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5383</th>\n", + " 
<td>undetermined</td>\n", + " <td>49.0</td>\n", + " <td>W</td>\n", + " <td>OR</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5385</th>\n", + " <td>gun</td>\n", + " <td>53.0</td>\n", + " <td>B</td>\n", + " <td>KY</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5388</th>\n", + " <td>gun</td>\n", + " <td>65.0</td>\n", + " <td>NaN</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5398</th>\n", + " <td>metal pipe</td>\n", + " <td>37.0</td>\n", + " <td>NaN</td>\n", + " <td>TN</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5399</th>\n", + " <td>pellet gun</td>\n", + " <td>26.0</td>\n", + " <td>NaN</td>\n", + " <td>NY</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5401</th>\n", + " <td>gun</td>\n", + " <td>81.0</td>\n", + " <td>NaN</td>\n", + " <td>NM</td>\n", + " <td>True</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5402</th>\n", + " <td>gun</td>\n", + " <td>31.0</td>\n", + " <td>NaN</td>\n", + " <td>CO</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5403</th>\n", + " <td>gun</td>\n", + " <td>38.0</td>\n", + " <td>B</td>\n", + " <td>FL</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5404</th>\n", + " <td>gun</td>\n", + " <td>59.0</td>\n", + " <td>NaN</td>\n", + " <td>ID</td>\n", + " <td>False</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5412</th>\n", + " <td>undetermined</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>CA</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5414</th>\n", + " <td>gun</td>\n", + " <td>24.0</td>\n", + " <td>NaN</td>\n", + " <td>IL</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " <tr>\n", + " 
<th>5415</th>\n", + " <td>gun</td>\n", + " <td>27.0</td>\n", + " <td>NaN</td>\n", + " <td>AZ</td>\n", + " <td>False</td>\n", + " <td>Not fleeing</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>1016 rows × 6 columns</p>\n", + "</div>" + ], + "text/plain": [ + " armed age race state signs_of_mental_illness flee\n", + "15 NaN 28.0 W MT False Not fleeing\n", + "26 NaN 24.0 B MN False Not fleeing\n", + "27 NaN 29.0 W MO False Not fleeing\n", + "45 NaN 42.0 B AZ False Car\n", + "52 NaN 29.0 B FL False Not fleeing\n", + "59 gun 59.0 NaN NJ False Not fleeing\n", + "60 NaN 17.0 H CO False Not fleeing\n", + "65 NaN 26.0 N AZ False Not fleeing\n", + "105 NaN 23.0 W CA False Car\n", + "122 NaN 50.0 H TX False Not fleeing\n", + "127 gun NaN H TX False Car\n", + "138 NaN 27.0 B OK False Car\n", + "166 NaN 48.0 B FL False Not fleeing\n", + "170 NaN 35.0 W DE False Car\n", + "179 NaN 37.0 B MD False Not fleeing\n", + "185 NaN 25.0 H CA False Not fleeing\n", + "199 NaN 24.0 W CA True Not fleeing\n", + "239 NaN 26.0 W OH False Car\n", + "240 gun 54.0 NaN NV False Not fleeing\n", + "244 NaN 27.0 B MD False Car\n", + "263 NaN 42.0 B GA False Car\n", + "270 NaN 54.0 NaN OK False Other\n", + "298 NaN 35.0 B NJ False Not fleeing\n", + "307 NaN 46.0 W CA False Car\n", + "340 knife 72.0 NaN GA True Not fleeing\n", + "342 NaN 21.0 B CA False Car\n", + "346 NaN 31.0 B MN False Car\n", + "348 NaN 34.0 B MD False Car\n", + "349 NaN 30.0 B TX False Car\n", + "398 gun 40.0 NaN OR True Not fleeing\n", + "... ... ... ... ... ... 
...\n", + "5348 gun 44.0 NaN TN True Foot\n", + "5349 gun 52.0 NaN MT False Not fleeing\n", + "5350 knife 43.0 NaN TN True Not fleeing\n", + "5353 toy weapon 35.0 NaN CA True Not fleeing\n", + "5354 knife 33.0 NaN TX False Not fleeing\n", + "5355 toy weapon NaN NaN CA True Not fleeing\n", + "5357 toy weapon 61.0 NaN NY True Not fleeing\n", + "5360 gun NaN NaN CO False NaN\n", + "5362 gun 47.0 NaN OH False Not fleeing\n", + "5363 undetermined 61.0 NaN FL False Foot\n", + "5366 sword 50.0 H CA False NaN\n", + "5368 knife 30.0 NaN OK False Not fleeing\n", + "5369 undetermined NaN NaN AL False Not fleeing\n", + "5373 crowbar NaN NaN CA False Not fleeing\n", + "5375 knife NaN NaN OR False Not fleeing\n", + "5377 toy weapon NaN W CA False NaN\n", + "5378 gun 32.0 A CA False NaN\n", + "5379 knife NaN NaN PA False Not fleeing\n", + "5383 undetermined 49.0 W OR False NaN\n", + "5385 gun 53.0 B KY False NaN\n", + "5388 gun 65.0 NaN CA False Not fleeing\n", + "5398 metal pipe 37.0 NaN TN False Not fleeing\n", + "5399 pellet gun 26.0 NaN NY False Not fleeing\n", + "5401 gun 81.0 NaN NM True Not fleeing\n", + "5402 gun 31.0 NaN CO False NaN\n", + "5403 gun 38.0 B FL False NaN\n", + "5404 gun 59.0 NaN ID False NaN\n", + "5412 undetermined NaN NaN CA False Not fleeing\n", + "5414 gun 24.0 NaN IL False Not fleeing\n", + "5415 gun 27.0 NaN AZ False Not fleeing\n", + "\n", + "[1016 rows x 6 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "null_rows = police_simple.isnull().any(axis=1)\n", + "police_simple[null_rows]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def check_NaN(dataframe):\n", + " print(\"Total NaN:\", dataframe.isnull().values.sum())\n", + " print(\"NaN by column:\\n\",dataframe.isnull().sum())\n", + " return" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Total NaN: 1232\n", + "NaN by column:\n", + " armed 227\n", + "age 235\n", + "race 521\n", + "state 0\n", + "signs_of_mental_illness 0\n", + "flee 249\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "check_NaN(police_simple)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> Given there are undefined values, we have three choices:\n", + "* Remove affected rows\n", + "* Remove affected attributes, e.g. remove the entire column showing whether the person killed was armed\n", + "* Fill blank with 'dummy' data, e.g \"unreported\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def fillNaN_unreported(dataframe, key):\n", + " dataframe[key].fillna(\"unreported\", inplace = True)\n", + " return " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "fillNaN_unreported(police_simple, \"armed\")\n", + "fillNaN_unreported(police_simple, \"race\")\n", + "fillNaN_unreported(police_simple, \"flee\")\n", + "\n", + "median = police_simple[\"age\"].median()\n", + "police_simple[\"age\"].fillna(median, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total NaN: 0\n", + "NaN by column:\n", + " armed 0\n", + "age 0\n", + "race 0\n", + "state 0\n", + "signs_of_mental_illness 0\n", + "flee 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "check_NaN(police_simple)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Impact of NaN values on data" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZMAAAEXCAYAAABoPamvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3XlcVPXi//HXyACmoKmBmnr95kbfNHGpBDMwS1ABUbJSCZe6aaaWppbiluQuZZrZnpXXrpoLuHSxrpalYJaVil/bbqK5XAR3SGCGOb8//DlXQh1wHIa5vZ+PBw85nzkz531GnfecZc6YDMMwEBERcUIVdwcQERHPpzIRERGnqUxERMRpKhMREXGaykRERJymMhEREaeZ3R1APENQUBAtWrSgSpX/vP9o1aoVM2bMcGMq17l0fQ3DwGw288ADDxAfHw/A3//+d86dO8eQIUOu+BgfffQRRUVF9vtc6tL7d+nShQULFnD77beXOd9vv/3G3LlzeeWVV8jOzubpp59m+fLl5V/Rcpo6dSpffvklMTExjB492j7+1Vdf8fjjj3PLLbfYx/Lz82nWrBmzZs2iVq1aLs8m7qUykTJ7//33qV27trtjVJhL1/fkyZM88cQTFBYW8uijj9KvXz+H99+1axfNmze/7G1luf/VHD16lAMHDgBQt27dCikSgBUrVvD5559Tr169Urf95S9/ITU11T5dXFzMyJEjeffddxkzZkyF5BP3UZmI01q1asV9993HDz/8QHJyMtWqVWPGjBmcPn2a4uJiEhIS6NOnDwALFixg/fr11KpVizvuuIPMzEyWLl3K+PHjad68OY899hhAiens7GySkpI4duwYFouFqKgonnjiCQ4fPsygQYMIDw9n9+7dnD17lnHjxtG1a1esVivz5s3j888/x8vLi7Zt2zJ16lRiYmKYMmUKd999NwATJ06kRYsWDBw48KrrWLt2bcaPH89TTz3F4MGDWbRoEadOnWLKlCl8+OGHLF++HG9vb3x9fUlKSuLAgQNs2bKF7du3U7VqVU6ePMn333/P8ePHCQoKonHjxvb7A3z44Yf88MMPFBUVMXjwYPr06cNXX33FCy+8wIYNGwDs06mpqUyaNIns7Gwee+wxpk2bRkxMDN999x0Wi4XZs2eTkZGBl5cXrVu3ZsKECfj5+dGlSxd69+5NRkYGx44dIzY2llGjRpVa159//pmkpCROnz6NyWTi0UcfpVevXvTv3x/DMHj88ceZOnUqd9xxx1Wfs7y8PE6ePEm7du0A+P7775k3bx5FRUXk5OTQsWNHZs6cCcBnn33Gyy+/jM1mo1q1akybNo1bb72Vb7/9luTkZM6fP0+VKlUYMWIE9957Lzk5OTz33HOcOnUKgPDw8Muui1QgQ6QMWrRoYURHRxs9e/a0/+Tm5tpvW7t2rWEYhmGxWIwePXoYmZmZhmEYxtmzZ43u3bsb3333nbFp0yajR48exrlz54yioiLjr3/9q/HII48YhmEYzz33nPH222/bl3fpdEJCgrF582bDMAyjoKDASEhIMDZu3Gj89ttvRosWLYwtW7YYhmEYaWlpRufOnQ3DMIz333/fiI+PN86fP28UFxcbTz/9tLF27VpjyZIlxlNPPWUYhmGcO3fOCAkJMc6cOXPZ9T1x4kSJsfz8fPv4woULjWnTphlWq9Vo2bKlkZ2dbRiGYaxdu9ZYvnx5qXVYuHChERkZaVgsFvv0tGnTDMMwjHvvvdeYOnWqYRiG8e9//9sIDQ01fvrpJ2PHjh1GVFSUffmXTl/6+2+//Wa0adPGMAzDWLBggTFixAijqKjIKC4uNsaPH29MnjzZvpzZs2fbl3P77bcbhw4dKrGOFovFuO+++4xNmzbZ57vnnnuMb7/99orPy8U8t99+u9GzZ0+jR48eRkhIiNGrVy/jjTfeMIqKigzDMIzRo0cbO3bsMAzDMPLy8owOHToYe/fuNXJycoz
27dsb+/btMwzDMDZt2mQ89thjxunTp42IiAjjt99+s2cJCwszjhw5YixatMi+Xvn5+caoUaOMs2fPlsolFUdbJlJmV9vNdfFdalZWFocOHSIxMdF+W0FBAf/3f//HL7/8QteuXfHz8wPg4Ycf5v3337/qMn///Xe+/vprzpw5w4IFC+xjP/zwA61bt8bb25vw8HAAbrvtNk6fPg1Aeno6sbGxVK1aFYCXX34ZgLNnz/Lqq69y8uRJ0tLS6Ny5MzVq1CjT+ptMJgB8fX3tY15eXnTr1o2+ffvSuXNnOnXqZM/zR23atMFsvvx/ub59+wIXdlndfffdZGRkEBQUVKZcl/riiy8YPXo03t7eACQkJDB8+HD77ffdd599OXXq1OHMmTM0atTIfntWVhaFhYVERETY54uIiODLL7+kbdu2V132pbu5Vq9ezfz58+nevbs9y+zZs/niiy94/fXX+fXXXyksLOT333/n22+/pXnz5tx2220AREREEBERwdatW8nJySmR32Qy8eOPP3LPPfcwZMgQjh07RseOHRkzZgz+/v7lfr7k+lGZyHVRrVo14MJ+cn9//xL7znNzc/H39+fll1/GuORScBdfZODCi8Slt1ksFgBsNhuGYbB8+XJuuOEG4MLxC19fX06dOoW3t7f9pICLL/ZAqRft3NxcbDYbgYGBdOvWjXXr1rF+/XqmTp1a5nXcu3cvDRs2pHr16iXGk5OT+emnn0hPT+fNN98kNTXVXnyXe44u59ITG2w2G2az+YrPydXYbLYSz4PNZitxv0uL8I+PDxf+/i69P4BhGFitVofLvtQDDzzA7t27efrpp1m5ciVms5lHHnmEoKAg7rnnHrp3787u3bsxDAMvL68SyzQMgx9//JHi4mKaNm3KRx99ZL8tOzub2rVr4+3tzebNm8nIyGDHjh08+OCDvPXWW7Rq1apcOeX60anBcl3dcsstVK1a1V4mx44dIzo6mszMTDp37kxaWhpnzpzBZrORkpJiv1+tWrXIzMwELrxg7Ny5EwA/Pz/atGnDkiVLgAtbFv369WPz5s1XzREaGsqGDRsoKirCZrPx/PPPs3HjRgDi4+P54IMPMAyD1q1bl2m9srOzSU5O5tFHHy0xfvLkScLDw7nxxhsZNGgQo0aNYu/evcCFrZayvgivXbsWuHBgPSMjg9DQUGrXrs3Ro0c5ceIEhmHY81987MuVyz333MPf//53LBYLNpuNZcuW2Y8PlUWTJk0wm8188skn9vXetGkTHTt2LPNjXDR27FiOHTvGsmXLOHv2LHv37mXs2LFERETw73//m0OHDmGz2QgODuZf//oXP//8MwCbN29m3LhxtGnThoMHD/L1118DsH//fiIjI+1/F4sXL+b+++9n4sSJNGvWzH5/cQ9tmch15ePjw+LFi5kxYwZvv/02VquVp59+mvbt2wMwYMAA+vfvj6+vLw0aNLDfLyEhgbFjxxIZGUnDhg0JCQmx35acnMwLL7xATEwMRUVFREdH07NnTw4fPnzFHH379uXIkSPExcVhGAZ33XUXCQkJANx6663UrFnTvmvpSgYOHEiVKlXw8vICKHFq8EW1a9dm2LBhDBo0iKpVq+Ll5cX06dMBCAsLY/bs2WV63goLC+nduzcWi4VJkybZT7Ht27cvDzzwAAEBAXTu3NleVM2aNcPX15c+ffowf/58++MMGzaMOXPm0KtXL6xWK61bt2by5MllygAXthYXL17M9OnTeeWVVyguLmb48OEl/j7KqkaNGowdO5ZZs2YRFRXFkCFD6N27N9WqVaNu3bq0a9eOgwcPEhoaSnJyMs899xzFxcX4+fkxf/58ateuzcKFC5k7dy6FhYUYhsHcuXNp2LAhAwcOZPz48URHR+Pj40NQUBBRUVHlzijXj8n443auSAVJS0tj2bJlLF26tEKXe+jQIRISEkhLS7PvOhMR52jLRP5UFixYwMqVK5k2bZqKROQ60paJiIg4TQfgRUTEaSo
TERFxmspEREScpjIRERGn/defzXXqVD42m84xEBEpiypVTNSqVd3xjH/wX18mNpuhMhERcTHt5hIREaepTERExGkuLZNFixYRFRVFVFQUc+fOBWDChAlEREQQGxtLbGwsn376KXDhkuExMTFERESUuNbQ/v37iYuLIzIykokTJ5b76qUiIuJ6LiuT9PR0tm3bxtq1a0lJSWHfvn18+umnZGZm8re//Y3U1FRSU1Pp2rUrBQUFJCYmsnjxYj7++GMyMzPZunUrAOPGjWPKlCls2rQJwzBYuXKlqyKLiMg1clmZBAQEMH78eHx8fPD29qZp06YcPXqUo0ePkpiYSExMDAsXLsRms7Fnzx4aN25Mo0aNMJvNxMTEkJaWxpEjRygoKKBNmzYAxMXFkZaW5qrIIiJyjVx2Nlfz5s3tv2dlZfGPf/yDZcuWsXPnTqZOnYq/vz9Dhw5l1apVVKtWjYCAAPv8gYGBZGdnc/z48RLjAQEBZGdnlytHnTp+zq+MiIhclctPDf75558ZOnQozz77LE2aNOHVV1+135aQkEBKSgqRkZGlvmnNZDKV+ta4i+PlceJEnk4NFhEpoypVTNf0JtylZbJr1y6eeuopEhMTiYqK4scffyQrK4vIyEjgQjmYzWbq1atHTk6O/X45OTkEBgaWGs/NzSUwMNCpTP41qlLV19vxjBWsoNDCubMF7o4hInJNXFYmx44dY/jw4cyfP5/Q0FDgQnnMnDmTkJAQqlWrxooVK+jduzfBwcEcOHCAgwcP0rBhQzZs2MADDzxAgwYN8PX1ZdeuXbRv357U1FTCwsKcylXV15v+zy67Hqt4XX04N55zqExExDO5rEzeeecdCgsLS3xtad++fRkyZAj9+vXDarUSERFBdHQ0ALNnz2bkyJEUFhYSHh5Ot27dgAtf2Tpp0iTy8vJo2bIlAwYMcFVkERG5Rv/1X471x2MmAQH+lXbLJCfnnLtjiMif3LUeM9En4EVExGkqExERcZrKREREnKYyERERp6lMRETEaSoTERFxmspEREScpjIRERGnqUxERMRpKhMREXGaykRERJymMhEREaepTERExGkqExERcZrKREREnKYyERERp6lMRETEaSoTERFxmspEREScpjIRERGnqUxERMRpKhMREXGaykRERJymMhEREaepTERExGkqExERcZrKREREnKYyERERp6lMRETEaSoTERFxmspEREScpjIRERGnlatMsrOz+eabb8o8/6JFi4iKiiIqKoq5c+cCkJ6eTkxMDBEREcyfP98+7/79+4mLiyMyMpKJEyditVoBOHr0KPHx8XTr1o1hw4aRn59fnsgiIlIBHJbJhx9+yJgxYzh58iRxcXFMnDiRF1980eEDp6ens23bNtauXUtKSgr79u1jw4YNJCYmsnjxYj7++GMyMzPZunUrAOPGjWPKlCls2rQJwzBYuXIlANOmTaN///6kpaXRqlUrFi9e7OQqi4jI9eawTFatWsWECRNIS0vjvvvuY+PGjWzfvt3hAwcEBDB+/Hh8fHzw9vamadOmZGVl0bhxYxo1aoTZbCYmJoa0tDSOHDlCQUEBbdq0ASAuLo60tDQsFgtff/01kZGRJcZFRKRycVgmJpOJm266iYyMDEJCQjCbzdhsNocP3Lx5c3s5ZGVl8Y9//AOTyURAQIB9nsDAQLKzszl+/HiJ8YCAALKzszl16hR+fn6YzeYS4yIiUrmYHc3g4+PDW2+9xc6dO5k+fToffvghN9xwQ5kX8PPPPzN06FCeffZZvLy8yMrKst9mGAYmkwmbzYbJZCo1fvHPS/1x2pE6dfzKNb87BQT4uzuCiMg1cVgmM2bM4J133mHOnDnUrFmTXbt2MX369DI9+K5du3jqqadITEwkKiqKnTt3kpOTY789JyeHwMBA6tWrV2I8NzeXwMBAateuzblz5yguLsbLy8s+f3mcOJGHzWbYpyvzC3ZOzjl3RxCRP7kqVUzX9Cbc4W6uJk2
aMHnyZOrXr49hGEyfPp2mTZs6fOBjx44xfPhwkpOTiYqKAiA4OJgDBw5w8OBBiouL2bBhA2FhYTRo0ABfX1927doFQGpqKmFhYXh7e3PHHXfw8ccfA5CSkkJYWFi5V1JERFzL4ZbJ999/z4gRIzCbzSxfvpzY2Fhee+012rVrd9X7vfPOOxQWFjJ79mz7WN++fZk9ezYjR46ksLCQ8PBwunXrBkBycjKTJk0iLy+Pli1bMmDAAACmTp3K+PHjee2116hfvz4vvfSSM+srIiIuYDIMw7jaDP379ycpKYmxY8eSkpLC1q1bWbhwIatXr66ojE653G6u/s8uc2Oiy/twbrx2c4mI27lsN1dBQQHNmjWzT4eHh1NcXFzuBYmIyH8vh2ViNps5c+aM/SyqX3/91eWhRETEszg8ZjJs2DAeeeQRcnNzeeaZZ9i+fTtJSUkVkU1ERDyEwzK59957adKkCdu3b8dmszF8+PAync0lIiJ/Hg53c/373/9myZIl9O/fn44dO/Liiy+W+EyIiIiIwzIZP348TZo0AaBBgwbcddddJCYmujyYiIh4DodlcurUKftnPnx9fRk0aJC2TEREpASHZVJcXFzi4oq5ubk4+GiKiIj8yTg8AD9o0CB69erFPffcg8lkIj09nWeffbYisomIiIdwWCZ9+vShVatW7NixAy8vLx577DFatGhREdlERMRDOCwTAH9/f+666y4Mw8BisbBv3z5atmzp6mwiIuIhHJbJggULePfdd6lTp459zGQysXnzZpcGExERz+GwTFJTU/nkk0+oW7duReQREREP5PBsrvr166tIRETkqhxumYSGhjJ37lzuu+8+qlatah/XMRMREbnIYZmsWbMGgLS0NPuYjpmIiMilHJbJli1bKiKHiIh4MIfHTPLz80lKSmLgwIGcPn2aKVOmkJ+fXxHZRETEQzgsk+nTp+Pv78+JEyfw9fUlLy+PKVOmVEQ2ERHxEA7LZP/+/YwePRqz2cwNN9xAcnIy+/fvr4hsIiLiIRyWSZUqJWcpLi4uNSYiIn9uDg/A33nnncybN4+CggK+/PJLli1bRocOHSoim4iIeAiHmxhjx46lWrVq+Pv7M3/+fIKCgnTVYBERKcHhlsnChQsZM2YMw4cPr4g8IiLigRxumXz++ecVEENERDyZwy2Thg0b8uijj9KuXTuqV69uHx88eLBLg4mIiOdwWCY33ngjAEeOHHF5GBER8UwOy+Smm25izJgxFZFFREQ8lI6ZiIiI03TMREREnKZjJiIi4jSHZTJr1qyKyCEiIh7MYZk88cQTlx1//fXXr3sYERHxTA4PwEdGRtp/unTpgsViISgoqMwLyMvLIzo6msOHDwMwYcIEIiIiiI2NJTY2lk8//RSA9PR0YmJiiIiIYP78+fb779+/n7i4OCIjI5k4cSJWq7W86ygiIi7mcMukd+/epaYTEhLK9OC7d+9m0qRJZGVl2ccyMzP529/+RmBgoH2soKCAxMREli5dSv369Rk6dChbt24lPDyccePGMX36dNq0aUNiYiIrV66kf//+ZVw9ERGpCOW+lrxhGBw/frxM865cuZKpU6fai+P8+fMcPXqUxMREYmJiWLhwITabjT179tC4cWMaNWqE2WwmJiaGtLQ0jhw5QkFBAW3atAEgLi6uxHfRi4hI5VDuYyY//fQTd911V5kefMaMGSWmc3NzCQkJYerUqfj7+zN06FBWrVpFtWrVCAgIsM8XGBhIdnY2x48fLzEeEBBAdnZ2mZYtIiIVx2GZREZG2n83mUz069ePTp06XdPCGjVqxKuvvmqfTkhIICUlhcjISEwmk33cMAxMJhM2m+2y4+VRp47fNWV1h4AAf3dHEBG5Jg7LpGvXrixbtoyhQ4dy5MgR3nvvPe68806qVatW7oX9+OOPZGVl2QvKMAzMZjP16tUjJyfHPl9OTg6BgYGlxnNzc0scaymLEyfysNkM+3RlfsHOyTnn7ggi8idXpYr
pmt6EOzxmMmHCBE6fPg1AjRo1MJlMTJ48ufwJuVAeM2fO5MyZM1gsFlasWEHXrl0JDg7mwIEDHDx4kOLiYjZs2EBYWBgNGjTA19eXXbt2AZCamkpYWNg1LVtERFzH4ZZJVlYWr7zyCgD+/v4kJibSs2fPa1rYrbfeypAhQ+jXrx9Wq5WIiAiio6MBmD17NiNHjqSwsJDw8HC6desGQHJyMpMmTSIvL4+WLVsyYMCAa1q2iIi4jsMysVqt5OXl4ed3YbMnPz8fwzAc3KukLVu22H+Pj48nPj6+1DyhoaGsW7eu1Pitt97KqlWryrU8ERGpWA7LpFevXjz44IN069YNk8nEp59+SlxcXEVkExERD+GwTIYOHUrz5s1JT0/HbDYzbtw4HbcQEZESHB6Az8vL4+eff2bSpEkkJCTw5Zdf8vvvv1dENhER8RAVejaXiIj8d3JYJllZWTz33HPAf87m+vnnn10eTEREPIfDMrl4NtdF13I2l4iI/HfT2VwiIuK0cp/NNXbsWMLDwysim4iIeAiHZZKbm4vNZiM0NJTWrVuXuIqviIgIOCiTl156iQ8++IAmTZpgsVg4fPgw8fHxjB07tqLyiYiIB7himaxevZrvvvuOzZs3U6dOHQCys7MZM2YMH330EQ8++GCFhRQRkcrtimdzrVixghdffNFeJAB169YlOTmZ5cuXV0g4ERHxDFfcMikqKrrsd4fUq1ePoqIil4aSK6tV0wezj6+7Y5RiLSrk1Bn9uxD5s7pimVztkikWi8UlYcQxs48vu+b+1d0xSmn/7NuAykTkz+qKu7latWpFampqqfGUlBRat27t0lAiIuJZrrhlMnr0aBISEvjll1+44447sFqtfPXVV6SlpbFixYqKzCgiIpXcFbdMGjVqxLJlyzhz5gzJycm8/PLLFBYWsnLlSurXr1+RGUVEpJK76udMGjRoQFJSUkVlERERD+XwQo8iIiKOqExERMRpVyyTf/7znwD6TImIiDh0xTJZsGABAA8//HCFhREREc90xQPw1atXJzIykuzsbGJiYkrdvn79epcGExERz3HFMnn77bfZv38/EydO1He+i4jIVV2xTPz8/Ljzzjt54403CAwMZN++fVitVlq3bo2fn19FZhQRkUrO4ZdjnTt3joSEBG666SaKi4vJzs7m9ddfp127dhWRT0REPIDDMpkzZw7JycmEhIQAkJGRwezZs1m5cqXLw4mIiGdw+DmT/Px8e5EAhIaGcv78eZeGEhERz+KwTEwmE0eOHLFPHz58GC8vL5eGEhERz+JwN9fw4cN5+OGHCQ0NxWQysW3bNqZOnVoR2URExEM4LJP777+fJk2asGPHDmw2G0OHDqVp06YVkU1ERDyEwzIBaNKkCU2aNHF1FhER8VC60KOIiDjNpWWSl5dHdHQ0hw8fBiA9PZ2YmBgiIiKYP3++fb79+/cTFxdHZGQkEydOxGq1AnD06FHi4+Pp1q0bw4YNIz8/35VxRUTkGjksk2efffaaHnj37t3069ePrKwsAAoKCkhMTGTx4sV8/PHHZGZmsnXrVgDGjRvHlClT2LRpE4Zh2D/DMm3aNPr3709aWhqtWrVi8eLF15RFRERcy2GZ7N+/H8Mwyv3AK1euZOrUqQQGBgKwZ88eGjduTKNGjTCbzcTExJCWlsaRI0coKCigTZs2AMTFxZGWlobFYuHrr78mMjKyxLiIiFQ+Dg/ABwYGEhUVRXBwMNWrV7ePT5o06ar3mzFjRonp48ePExAQUOJxs7OzS40HBASQnZ3NqVOn8PPzw2w2lxgXEZHKx2GZtG3blrZt2zq9IJvNhslksk8bhoHJZLri+MU/L/XH6bKoU8dzLkoZEODv7ghO8fT8InLtHJbJiBEjKCgo4ODBgzRv3pzCwkJuuOGGci+oXr165OTk2KdzcnIIDAwsNZ6bm0tgYCC1a9fm3LlzFBcX4+XlZZ+/vE6cyMNm+89uusr8gpeTc87hPJ6eX0Qqtyp
VTNf0JtzhMZPdu3dz//33M3ToUI4fP07nzp359ttvy72g4OBgDhw4wMGDBykuLmbDhg2EhYXRoEEDfH192bVrFwCpqamEhYXh7e3NHXfcwccffwxASkoKYWFh5V6uiIi4nsMymTNnDu+99x433ngj9erVY+7cuaWOh5SFr68vs2fPZuTIkfTo0YMmTZrQrVs3AJKTk5k1axbdunXj999/Z8CAAQBMnTqVlStX0qNHD7755htGjRpV7uWKiIjrOdzNVVBQQLNmzezT4eHhJT4j4siWLVvsv4eGhrJu3bpS89x6662sWrWq1HiDBg1YunRpmZclIiLu4XDLxGw2c+bMGfvB719//dXloURExLM43DIZNmwYjzzyCDk5OTzzzDNs376dpKSkisgmIiIewmGZ3HvvvTRp0oTt27djs9kYPny4rhosIiIllOmqwVarFZvNhtlstn+IUORa1Kjpi6+Pj7tjlFJYVMTZM4XujiHisRw2w+rVq3nppZfo1KkTxcXFLFq0iMmTJ9svcyJSHr4+Pgxa8rS7Y5Ty3uAFgMpE5Fo5LJP33nuPtWvX2j8wePToUYYOHaoyERERO4dnc3l7e5f45PnNN9+Mt7e3S0OJiIhnueKWyb59+wAICgoiKSmJhx9+GC8vL9asWUO7du0qLKCIiFR+VyyTkSNHlpj+/PPP7b+bTCaHVw0WEZE/jyuWyaWfXBcREbkahwfgc3JyWLt2LadPny4xfq3fwCgiIv99HB6AHzZsGHv27MEwjBI/IiIiFzncMrFYLCxatKgisoiIiIdyuGXSsmVLfvrpp4rIIiIiHsrhlkm7du3o1asXAQEBJS6lsnnzZpcGExERz+GwTN555x2Sk5P5y1/+UhF5RETEAzkskxo1atCjR4+KyCIiIh7KYZmEhIQwZ84cIiIi8Lnkaq8tW7Z0aTAREfEcDstk/fr1AGzatMk+ZjKZdMxERETsHJaJPgkvIiKOOCyTJUuWXHZ88ODB1z2MiIh4JodlculnTIqKivj6668JDQ11aSgREfEsDstk1qxZJaazs7OZOHGiywKJiIjncfgJ+D+qW7cuR44ccUUWERHxUOU6ZmIYBpmZmdSpU8eloURExLOU65j+2wRdAAAPIklEQVQJQP369XX5eRERKaHcx0xERET+6IplMmHChCveyWQyMXPmTJcEEhERz3PFMmnevHmpsVOnTvH+++/ToEEDl4YSERHPcsUyefTRR0tMp6en89xzzxETE8OkSZNcHkxERDyHw2MmVquVF198kbVr1zJt2jQiIyMrIpeIiHiQq5ZJVlYWzzzzDNWrVyclJYV69epVVC4REfEgV/zQ4urVq3nooYfo2rUrS5cuVZGIiMgVXXHLZOLEiVSpUoU333yTt956yz5uGAYmk4lvv/32mheakJDAyZMn7V8DnJSUxKFDh3jttdewWq0MHDiQ+Ph44MKxmlmzZlFYWEj37t0ZPXr0NS9XRERc44pl4qrvKzEMg6ysLD777DN7mWRnZzN69GjWrFmDj48Pffv2pUOHDjRs2JDExESWLl1K/fr1GTp0KFu3biU8PNwl2URE5NpcsUxcdfrvr7/+Clw4W+z06dM89NBDVK9enZCQEG688UYAIiMjSUtL46677qJx48Y0atQIgJiYGNLS0lQm4hY3+vvgXdXX3TEuy1JQyOlzRe6OIX9iDs/mut7Onj1LaGgokydPxmKxMGDAALp3705AQIB9nsDAQPbs2cPx48dLjWdnZ5dreXXq+F237K4WEODv7ghO+TPk/3hA5fwenx4fLCGgkhad/DlUeJm0bduWtm3b2qf79OnDrFmzGDZsmH3s4nEZm82GyWQqNV4eJ07kYbMZ9unK/IKXk3PO4TzK7zqO8lfm7FC251/EkSpVTNf0Jrzcl6B31jfffENGRoZ92jAMGjRoQE5Ojn0sJyeHwMBA6tWrd9lxERGpXCq8TM6dO8fcuXMpLCwkLy+PtWvXMm/ePDIyMjh58iT
nz5/nk08+ISwsjODgYA4cOMDBgwcpLi5mw4YNhIWFVXRkERFxoMJ3c917773s3r2bXr16YbPZ6N+/P+3bt2f06NEMGDAAi8VCnz59aN26NQCzZ89m5MiRFBYWEh4eTrdu3So6soiIOFDhZQIwatQoRo0aVWIsJiaGmJiYUvOGhoaybt26ioomIiLXoMJ3c4mIyH8flYmIiDhNZSIiIk5TmYiIiNNUJiIi4jSViYiIOE1lIiIiTlOZiIiI01QmIiLiNJWJiIg4TWUiIiJOU5mIiIjTVCYiIuI0lYmIiDhNZSIiIk5TmYiIiNNUJiIi4jSViYiIOE1lIiIiTlOZiIiI01QmIiLiNJWJiIg4TWUiIiJOU5mIiIjTVCYiIuI0lYmIiDhNZSIiIk5TmYiIiNPM7g4gIhWjZo0b8PGtnP/liwqtnDl73t0xxAmV81+WiFx3Pr5mZk5c5e4Yl5U4o4+7I4iTVCYi4hFq1vDBx9fX3TFKKSos5MzZInfHcDuViYh4BB9fX16aMNTdMUp5ZtYbgMpEZSIiUgFq1bwBs0/le8m1Flk5dcb541WVb80uY/369bz22mtYrVYGDhxIfHy8uyOJiJSL2cfM7sWfuztGKcFPdr4uj1PpyyQ7O5v58+ezZs0afHx86Nu3Lx06dKBZs2bujiYiIv9fpS+T9PR0QkJCuPHGGwGIjIwkLS2NESNGlOn+VaqYSo3dVKv6dc14vVwu6+X41Kjj4iTXpqz5b/Kr7eIk16Ys+W+4qXI+91C2/DVvrFYBSa5NWfLXuLFyPv9l/bfv7V/VxUmuzaX5y7ouf2QyDMO4XoFc4Y033uD3339n9OjRAHz00Ufs2bOHF154wc3JRETkokr/CXibzYbJ9J+mNAyjxLSIiLhfpS+TevXqkZOTY5/OyckhMDDQjYlEROSPKn2ZdOzYkYyMDE6ePMn58+f55JNPCAsLc3csERG5RKU/AF+3bl1Gjx7NgAEDsFgs9OnTh9atW7s7loiIXKLSH4AXEZHKr9Lv5hIRkcpPZSIiIk5TmYiIiNNUJiIi4jSVSRkNHjyYf/7zn/bpOXPm0LZtW4qK/nPp6U6dOnH48GF3xCuzr776irZt2xIbG0vPnj3p3r0777//vrtjldnhw4fp0qVLqfGgoCA3pKlYCQkJ5Zr/q6++Kvd9yuqnn34iKCiITZs2ueTxXeHw4cMEBQWxffv2EuNdunSp9P9vAfLz85k2bRpdu3alZ8+e9O/fn4yMDHfHslOZlFFISAi7du2yT6enp9OmTRv72MGDB6lWrRoNGzZ0V8Qya9WqFampqaxbt46PPvqId999l19++cXdscSBnTt3ujuC3erVq+nWrRsrVqxwd5Ry8fb2ZvLkyeTl5bk7SrkYhsETTzyBt7c3GzduZN26dUyaNIlx48bx1VdfuTseoDIps9DQUL777jvgwpWMfXx8iIyMZNu2bQB888033H333e6MeE0KCwvx8vLC39/f3VEqjT++ox8/fjxr1qyhV69ejBs3jujoaAYOHMjp06eBC280/vrXvxIbG4vFYuHNN9+kd+/e9OzZk7lz52IYBocPHyYmJoann36aqKgoHn/8cfv9P/vsM2JjY4mJieHJJ58kNzcXuPCOedSoUURGRvL8888D8OCDDwLwxRdf0KdPH3r16sWIESM4deoUANu2bSMqKoq4uDhWrlzpkufHYrGwfv16Ro0axb59+zh06JBLluMKgYGBdOzYkTlz5rg7Srns3LmTo0ePMmHCBHx8fAC47bbbGDZsGIsXL3ZzugtUJmXUsmVLDh06RGFhIdu2bePuu+/m7rvv9sgyyczMtL94denShbvuusujLlFz/PhxYmNjS/xUhB9++IHBgwezYcMGatSowfr16wE4deoUjz/+OKmpqWRkZJCZmcmqVatISUkhOzubdevWARd2DfXv35+NGzfStGlTFi1axIkTJ5gyZQqvvvoq69evp127diQlJdmXGRYWxqZNm+xl8tF
HH3Hy5ElefPFF3nnnHVJSUujUqRPJyckUFRUxfvx4Fi5cyJo1a6ha1TVXqN26dSs333wzt9xyC/fff7/HbZ2MHz+ebdu2ldrdVZnt3buXVq1albou4Z133snevXvdlKoklUkZeXl5ERwczN69e9m2bRudOnWiUaNGFBQUcObMGb777jtCQkLcHbNMLu7mWr9+Pdu3bycrK4s333zT3bHKLDAwkNTU1BI/FaFOnTrcdtttADRv3pwzZ87YbwsODgYgIyODPXv2EBcXR+/evcnMzLTvQvyf//kfOnToAECvXr3YsWMHe/bsoXXr1vbdow8//DA7duwo9biX2r17N8eOHWPAgAHExsaybNkyDh48yI8//khgYCBNmzYFoHfv3i54Fi7s4oqOjgagR48erFmzpsSxw8rOz8+PF154waN2d5lMJoqLi0uNWyyWSnPh20p/OZXKJCQkhG+//ZY9e/Ywb9484MLur82bN1OrVi38/PzcnLD8/Pz86N69O+np6e6OUmmYTCYuvTCExWIBwNfX94rzXNwKKC4uZuDAgQwePBiAs2fP4uXlxalTpzCb//PfzTAMvLy8sNlsJZZtGAZWq9U+fekyLyouLqZdu3a8/vrrwIVdlfn5+Rw9erREJi8vr/KvvAMnTpzgyy+/ZN++fXzwwQcYhsHZs2f59NNPiYqKuu7Lc5VOnTp51O6u4OBgli5disViwdvb2z7+/fff06pVKzcm+w9tmZRDaGgoqamptGjRwv7CcPfdd7NkyRKP2cX1R8XFxezcudP+jlugVq1a/PbbbxQWFnL69OkSJ144EhISQmpqKvn5+VitVoYPH24/4+nAgQPs378fuPDuPiwsjODgYHbv3m0/m2jFihX2rZc/8vLywmq1EhwczPfff8+BAwcAWLx4MXPnziUoKIjc3Fx++OEHADZu3HjNz8GVpKamEhISwhdffMGWLVv47LPPeOKJJ1i+fPl1X5arXdzddfz4cXdHceiOO+6gWbNmzJw50/7mJjMzk9dee40nn3zSzeku0JZJObRo0YLTp0/Tv39/+1hISAijRo2iY8eObkxWPhePmZhMJqxWK0FBQTz++OPujlVpNG/enPDwcKKiomjQoAHt27cv8327dOnCDz/8wEMPPURxcTH33HMPvXv35siRI9SsWZOFCxdy6NAhgoKCmD59OtWqVSMpKYkRI0ZgsVi4+eabmTFjxmUf+7777iM2NpY1a9Ywc+ZMRo0ahc1mo27dusybNw9vb29eeuklxo0bh9lsdskbhLVr19q/qO6i+Ph43n77bf71r3/Zd7F5gou7ux577DF3RymTRYsWMX/+fKKjo/Hy8qJmzZrMmzfvim8+Kpou9ChSAQ4fPsyAAQPYsmWLu6OIuIR2c4mIiNO0ZSIiIk7TlomIiDhNZSIiIk5TmYiIiNN0arDIdRAUFESLFi2oUqUKJpOJ8+fP4+fnx/PPP8/tt9/u7ngiLqcD8CLXQVBQEBkZGdSuXds+9s477/DJJ5943LWrRK6FtkxEXMBqtXLs2DFq1qwJQG5uLlOmTOHEiRPk5OTQoEEDXn75ZerUqcOBAweYMmUKJ0+epEqVKgwbNowePXqQnZ1NUlISx44dw2KxEBUVxRNPPOHmNRO5PG2ZiFwHF3dzwYWrCPv6+nLvvfcybNgw6tSpw/vvv09hYSFDhgzBMAyGDBlCaGgojz76KL1796ZPnz7Ex8dz7NgxEhISSElJ4cknn2TQoEF06dKFwsJCHn/8cfr27UuPHj3cvLYipalMRK6DS3dz7du3jyFDhvD888/TtWtX+zzffPMNe/fuJSsri61bt9KnTx8eeeQRQkND2b17t/17KgB+//132rdvby+oi2Pdu3fnmWeeqdB1EykL7eYSuc5atmzJhAkTGD9+PP/7v/9Lw4YNmTdvHnv27OGBBx6gQ4cOWK1WDMOwXzD00suI//rrrwQEBGAYBsuXL+eGG24A4OTJk5e9irBIZaBTg0VcIDo
6mtatWzNr1izgwjcgDhw4kF69elGnTh3S09MpLi7Gz8+Pli1bkpKSAsCxY8fo168fBQUFtGnThiVLlgAXLmXfr18/Nm/e7LZ1Erka7eYSuQ4udzbXr7/+Ss+ePXnttdc4f/48ycnJ+Pr64u3tzS233IJhGLz00kscPHiQadOmkZubi8lkYuTIkdx///0cPnyYF154gaNHj1JUVER0dDQjR45041qKXJnKREREnKbdXCIi4jSViYiIOE1lIiIiTlOZiIiI01QmIiLiNJWJiIg4TWUiIiJOU5mIiIjT/h8wCwai3vpzewAAAABJRU5ErkJggg==\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import seaborn as sns\n", + "\n", + "race_count = police_simple['race'].value_counts()\n", + "sns.set(style=\"darkgrid\")\n", + "sns.barplot(race_count.index, race_count.values)\n", + "plt.title('Frequency Distribution of Races')\n", + "plt.ylabel('Number of Occurrences', fontsize=12)\n", + "plt.xlabel('Race', fontsize=12)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Let's look for 'placeholder' values used in place of missing values**" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "#check the age column for impossible ages\n", + "for each_age in police_simple['age']:\n", + " if each_age > 100 or each_age < 0:\n", + " print(each_age)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Let's consider if any values were entered by humans**" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['gun', 'unarmed', 'toy weapon', 'nail gun', 'knife', 'unreported',\n", + " 'shovel', 'hammer', 'hatchet', 'undetermined', 'sword', 'machete',\n", + " 'box cutter', 'metal object', 'screwdriver', 'lawn mower blade',\n", + " 'flagpole', 'guns and explosives', 'cordless drill', 'crossbow',\n", + " 'metal pole', 'Taser', 'metal pipe', 'metal hand tool',\n", + " 'blunt object', 'metal stick', 'sharp object', 'meat cleaver',\n", + " 'carjack', 'chain', \"contractor's level\", 'unknown weapon',\n", + 
" 'stapler', 'beer bottle', 'bean-bag gun',\n", + " 'baseball bat and fireplace poker', 'straight edge razor',\n", + " 'gun and knife', 'ax', 'brick', 'baseball bat', 'hand torch',\n", + " 'chain saw', 'garden tool', 'scissors', 'pole', 'pick-axe',\n", + " 'flashlight', 'vehicle', 'baton', 'spear', 'chair', 'pitchfork',\n", + " 'hatchet and gun', 'rock', 'piece of wood', 'bayonet', 'pipe',\n", + " 'glass shard', 'motorcycle', 'pepper spray', 'metal rake',\n", + " 'crowbar', 'oar', 'machete and gun', 'tire iron',\n", + " 'air conditioner', 'pole and knife', 'baseball bat and bottle',\n", + " 'fireworks', 'pen', 'chainsaw', 'gun and sword', 'gun and car',\n", + " 'pellet gun', 'claimed to be armed', 'BB gun', 'incendiary device',\n", + " 'samurai sword', 'bow and arrow', 'gun and vehicle',\n", + " 'vehicle and gun', 'wrench', 'walking stick', 'barstool',\n", + " 'grenade', 'BB gun and vehicle', 'wasp spray', 'air pistol',\n", + " 'Airsoft pistol', 'baseball bat and knife', 'vehicle and machete',\n", + " 'ice pick', 'car, knife and mace'], dtype=object)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#check for spelling errors\n", + "police_simple['armed'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "police_simple.loc[police_simple['armed'] == 'undetermined', 'armed'] = \"unreported\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['gun', 'unarmed', 'toy weapon', 'nail gun', 'knife', 'unreported',\n", + " 'shovel', 'hammer', 'hatchet', 'sword', 'machete', 'box cutter',\n", + " 'metal object', 'screwdriver', 'lawn mower blade', 'flagpole',\n", + " 'guns and explosives', 'cordless drill', 'crossbow', 'metal pole',\n", + " 'Taser', 'metal pipe', 'metal hand tool', 'blunt object',\n", + " 'metal stick', 'sharp object', 'meat cleaver', 
'carjack', 'chain',\n", + " \"contractor's level\", 'unknown weapon', 'stapler', 'beer bottle',\n", + " 'bean-bag gun', 'baseball bat and fireplace poker',\n", + " 'straight edge razor', 'gun and knife', 'ax', 'brick',\n", + " 'baseball bat', 'hand torch', 'chain saw', 'garden tool',\n", + " 'scissors', 'pole', 'pick-axe', 'flashlight', 'vehicle', 'baton',\n", + " 'spear', 'chair', 'pitchfork', 'hatchet and gun', 'rock',\n", + " 'piece of wood', 'bayonet', 'pipe', 'glass shard', 'motorcycle',\n", + " 'pepper spray', 'metal rake', 'crowbar', 'oar', 'machete and gun',\n", + " 'tire iron', 'air conditioner', 'pole and knife',\n", + " 'baseball bat and bottle', 'fireworks', 'pen', 'chainsaw',\n", + " 'gun and sword', 'gun and car', 'pellet gun',\n", + " 'claimed to be armed', 'BB gun', 'incendiary device',\n", + " 'samurai sword', 'bow and arrow', 'gun and vehicle',\n", + " 'vehicle and gun', 'wrench', 'walking stick', 'barstool',\n", + " 'grenade', 'BB gun and vehicle', 'wasp spray', 'air pistol',\n", + " 'Airsoft pistol', 'baseball bat and knife', 'vehicle and machete',\n", + " 'ice pick', 'car, knife and mace'], dtype=object)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "police_simple['armed'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "police_simple.loc[police_simple['armed'] == 'unknown weapon', 'armed'] = \"unreported\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['gun', 'unarmed', 'toy weapon', 'nail gun', 'knife', 'unreported',\n", + " 'shovel', 'hammer', 'hatchet', 'sword', 'machete', 'box cutter',\n", + " 'metal object', 'screwdriver', 'lawn mower blade', 'flagpole',\n", + " 'guns and explosives', 'cordless drill', 'crossbow', 'metal pole',\n", + " 'Taser', 'metal pipe', 'metal hand tool', 'blunt object',\n", + " 'metal 
stick', 'sharp object', 'meat cleaver', 'carjack', 'chain',\n", + " \"contractor's level\", 'stapler', 'beer bottle', 'bean-bag gun',\n", + " 'baseball bat and fireplace poker', 'straight edge razor',\n", + " 'gun and knife', 'ax', 'brick', 'baseball bat', 'hand torch',\n", + " 'chain saw', 'garden tool', 'scissors', 'pole', 'pick-axe',\n", + " 'flashlight', 'vehicle', 'baton', 'spear', 'chair', 'pitchfork',\n", + " 'hatchet and gun', 'rock', 'piece of wood', 'bayonet', 'pipe',\n", + " 'glass shard', 'motorcycle', 'pepper spray', 'metal rake',\n", + " 'crowbar', 'oar', 'machete and gun', 'tire iron',\n", + " 'air conditioner', 'pole and knife', 'baseball bat and bottle',\n", + " 'fireworks', 'pen', 'chainsaw', 'gun and sword', 'gun and car',\n", + " 'pellet gun', 'claimed to be armed', 'BB gun', 'incendiary device',\n", + " 'samurai sword', 'bow and arrow', 'gun and vehicle',\n", + " 'vehicle and gun', 'wrench', 'walking stick', 'barstool',\n", + " 'grenade', 'BB gun and vehicle', 'wasp spray', 'air pistol',\n", + " 'Airsoft pistol', 'baseball bat and knife', 'vehicle and machete',\n", + " 'ice pick', 'car, knife and mace'], dtype=object)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "police_simple['armed'].unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dataset 2: World Happiness Report up to 2020\n", + "You can access this dataset here: www.kaggle.com/mathurinache/world-happiness-report \n", + "> This is an example of a dataset that is:\n", + "* High quality\n", + "* Numerical" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "DATASET_PATH = './datasets/happiness/2015.csv'\n", + "\n", + "#create pandas object\n", + "happiness = load_data(DATASET_PATH)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + 
"text": [ + "Total NaN: 0\n", + "NaN by column:\n", + " Country 0\n", + "Region 0\n", + "Happiness Rank 0\n", + "Happiness Score 0\n", + "Standard Error 0\n", + "Economy (GDP per Capita) 0\n", + "Family 0\n", + "Health (Life Expectancy) 0\n", + "Freedom 0\n", + "Trust (Government Corruption) 0\n", + "Generosity 0\n", + "Dystopia Residual 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "check_NaN(happiness)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country</th>\n", + " <th>Region</th>\n", + " <th>Happiness Rank</th>\n", + " <th>Happiness Score</th>\n", + " <th>Standard Error</th>\n", + " <th>Economy (GDP per Capita)</th>\n", + " <th>Family</th>\n", + " <th>Health (Life Expectancy)</th>\n", + " <th>Freedom</th>\n", + " <th>Trust (Government Corruption)</th>\n", + " <th>Generosity</th>\n", + " <th>Dystopia Residual</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Switzerland</td>\n", + " <td>Western Europe</td>\n", + " <td>1</td>\n", + " <td>7.587</td>\n", + " <td>0.03411</td>\n", + " <td>1.39651</td>\n", + " <td>1.34951</td>\n", + " <td>0.94143</td>\n", + " <td>0.66557</td>\n", + " <td>0.41978</td>\n", + " <td>0.29678</td>\n", + " <td>2.51738</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Iceland</td>\n", + " <td>Western Europe</td>\n", + " <td>2</td>\n", + " <td>7.561</td>\n", + " <td>0.04884</td>\n", + " <td>1.30232</td>\n", + " <td>1.40223</td>\n", + " <td>0.94784</td>\n", + " 
<td>0.62877</td>\n", + " <td>0.14145</td>\n", + " <td>0.43630</td>\n", + " <td>2.70201</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Denmark</td>\n", + " <td>Western Europe</td>\n", + " <td>3</td>\n", + " <td>7.527</td>\n", + " <td>0.03328</td>\n", + " <td>1.32548</td>\n", + " <td>1.36058</td>\n", + " <td>0.87464</td>\n", + " <td>0.64938</td>\n", + " <td>0.48357</td>\n", + " <td>0.34139</td>\n", + " <td>2.49204</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Norway</td>\n", + " <td>Western Europe</td>\n", + " <td>4</td>\n", + " <td>7.522</td>\n", + " <td>0.03880</td>\n", + " <td>1.45900</td>\n", + " <td>1.33095</td>\n", + " <td>0.88521</td>\n", + " <td>0.66973</td>\n", + " <td>0.36503</td>\n", + " <td>0.34699</td>\n", + " <td>2.46531</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Canada</td>\n", + " <td>North America</td>\n", + " <td>5</td>\n", + " <td>7.427</td>\n", + " <td>0.03553</td>\n", + " <td>1.32629</td>\n", + " <td>1.32261</td>\n", + " <td>0.90563</td>\n", + " <td>0.63297</td>\n", + " <td>0.32957</td>\n", + " <td>0.45811</td>\n", + " <td>2.45176</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Country Region Happiness Rank Happiness Score \\\n", + "0 Switzerland Western Europe 1 7.587 \n", + "1 Iceland Western Europe 2 7.561 \n", + "2 Denmark Western Europe 3 7.527 \n", + "3 Norway Western Europe 4 7.522 \n", + "4 Canada North America 5 7.427 \n", + "\n", + " Standard Error Economy (GDP per Capita) Family \\\n", + "0 0.03411 1.39651 1.34951 \n", + "1 0.04884 1.30232 1.40223 \n", + "2 0.03328 1.32548 1.36058 \n", + "3 0.03880 1.45900 1.33095 \n", + "4 0.03553 1.32629 1.32261 \n", + "\n", + " Health (Life Expectancy) Freedom Trust (Government Corruption) \\\n", + "0 0.94143 0.66557 0.41978 \n", + "1 0.94784 0.62877 0.14145 \n", + "2 0.87464 0.64938 0.48357 \n", + "3 0.88521 0.66973 0.36503 \n", + "4 0.90563 0.63297 0.32957 \n", + "\n", + " Generosity Dystopia 
Residual \n", + "0 0.29678 2.51738 \n", + "1 0.43630 2.70201 \n", + "2 0.34139 2.49204 \n", + "3 0.34699 2.46531 \n", + "4 0.45811 2.45176 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "happiness.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}