Files
Amazing-Feature-Engineering/2.2_Demo_Outlier.ipynb
DESKTOP-SAT83DL\yimeng.zhang 149c80cedf 2018.12.2 First commit.
2018-12-02 21:11:32 +08:00

1583 lines
44 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"# import seaborn as sns\n",
"# import matplotlib.pyplot as plt\n",
"import os\n",
"# plt.style.use('seaborn-colorblind')\n",
"# %matplotlib inline\n",
"from feature_cleaning import outlier as ot"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(891, 6)\n"
]
}
],
"source": [
"use_cols = [\n",
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
" 'Survived'\n",
"]\n",
"\n",
"\n",
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
"data.head(3)\n",
"print(data.shape)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"104 0.0000\n",
"163 4.0125\n",
"245 5.0000\n",
"152 6.2375\n",
"240 6.4375\n",
"237 6.4500\n",
"110 6.4958\n",
"85 6.7500\n",
"171 6.8583\n",
"238 6.9500\n",
"78 6.9750\n",
"185 7.0458\n",
"79 7.0500\n",
"218 7.0542\n",
"123 7.1250\n",
"76 7.1417\n",
"18 7.2250\n",
"32 7.2292\n",
"0 7.2500\n",
"91 7.3125\n",
"196 7.4958\n",
"186 7.5208\n",
"120 7.5500\n",
"192 7.6292\n",
"55 7.6500\n",
"182 7.7250\n",
"166 7.7292\n",
"93 7.7333\n",
"229 7.7375\n",
"227 7.7417\n",
" ... \n",
"46 80.0000\n",
"177 81.8583\n",
"30 82.1708\n",
"147 83.1583\n",
"47 83.4750\n",
"126 86.5000\n",
"180 89.1042\n",
"117 90.0000\n",
"136 91.0792\n",
"198 93.5000\n",
"200 106.4250\n",
"144 108.9000\n",
"143 110.8833\n",
"114 113.2750\n",
"168 120.0000\n",
"155 133.6500\n",
"151 134.5000\n",
"130 135.6333\n",
"27 146.5208\n",
"139 151.5500\n",
"129 153.4625\n",
"150 164.8667\n",
"224 211.3375\n",
"162 211.5000\n",
"199 221.7792\n",
"164 227.5250\n",
"75 247.5208\n",
"148 262.3750\n",
"23 263.0000\n",
"127 512.3292\n",
"Length: 248, dtype: float64"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.Series(data.Fare.unique()).sort_values()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Detect by arbitrary boundary\n",
"identify outliers based on arbitrary boundaries"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num of outlier detected: 19\n",
"Proportion of outlier detected 0.02132435465768799\n",
"Upper bound: 300 \n",
"Lower bound: 5\n"
]
}
],
"source": [
"index,para = ot.outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)\n",
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"179 0.0000\n",
"806 0.0000\n",
"732 0.0000\n",
"674 0.0000\n",
"633 0.0000\n",
"597 0.0000\n",
"815 0.0000\n",
"466 0.0000\n",
"481 0.0000\n",
"302 0.0000\n",
"277 0.0000\n",
"271 0.0000\n",
"263 0.0000\n",
"413 0.0000\n",
"822 0.0000\n",
"378 4.0125\n",
"679 512.3292\n",
"737 512.3292\n",
"258 512.3292\n",
"Name: Fare, dtype: float64"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check the 19 found outliers\n",
"data.loc[index,'Fare'].sort_values()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## IQR method\n",
"outlier detection by Interquartile Ranges Rule"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num of outlier detected: 31\n",
"Proportion of outlier detected 0.03479236812570146\n",
"Upper bound: 146.448 \n",
"Lower bound: -107.53760000000001\n"
]
}
],
"source": [
"index,para = ot.outlier_detect_IQR(data=data,col='Fare',threshold=5)\n",
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"31 146.5208\n",
"195 146.5208\n",
"305 151.5500\n",
"708 151.5500\n",
"297 151.5500\n",
"498 151.5500\n",
"609 153.4625\n",
"332 153.4625\n",
"268 153.4625\n",
"318 164.8667\n",
"856 164.8667\n",
"730 211.3375\n",
"779 211.3375\n",
"689 211.3375\n",
"377 211.5000\n",
"527 221.7792\n",
"700 227.5250\n",
"716 227.5250\n",
"557 227.5250\n",
"380 227.5250\n",
"299 247.5208\n",
"118 247.5208\n",
"311 262.3750\n",
"742 262.3750\n",
"341 263.0000\n",
"88 263.0000\n",
"438 263.0000\n",
"27 263.0000\n",
"679 512.3292\n",
"258 512.3292\n",
"737 512.3292\n",
"Name: Fare, dtype: float64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check the 31 found outliers\n",
"data.loc[index,'Fare'].sort_values()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mean and Standard Deviation Method\n",
"outlier detection by Mean and Standard Deviation Method."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num of outlier detected: 20\n",
"Proportion of outlier detected 0.02244668911335578\n",
"Upper bound: 181.2844937601173 \n",
"Lower bound: -116.87607782296811\n"
]
}
],
"source": [
"index,para = ot.outlier_detect_mean_std(data=data,col='Fare',threshold=3)\n",
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"779 211.3375\n",
"730 211.3375\n",
"689 211.3375\n",
"377 211.5000\n",
"527 221.7792\n",
"716 227.5250\n",
"700 227.5250\n",
"380 227.5250\n",
"557 227.5250\n",
"118 247.5208\n",
"299 247.5208\n",
"311 262.3750\n",
"742 262.3750\n",
"27 263.0000\n",
"341 263.0000\n",
"88 263.0000\n",
"438 263.0000\n",
"258 512.3292\n",
"737 512.3292\n",
"679 512.3292\n",
"Name: Fare, dtype: float64"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# check the 20 found outliers\n",
"data.loc[index,'Fare'].sort_values()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## MAD method\n",
"outlier detection by Median and Median Absolute Deviation Method (MAD)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num of outlier detected: 160\n",
"Proportion of outlier detected 0.17957351290684623\n"
]
}
],
"source": [
"# too aggressive for our dataset, about 18% of cases are detected as outliers.\n",
"index = ot.outlier_detect_MAD(data=data,col='Fare',threshold=3.5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imputation with arbitrary value\n",
"impute outliers with arbitrary value"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num of outlier detected: 19\n",
"Proportion of outlier detected 0.02132435465768799\n",
"Upper bound: 300 \n",
"Lower bound: 5\n"
]
}
],
"source": [
"# use any of the detection method above\n",
"index,para = ot.outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)\n",
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Fare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>255</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>29.0</td>\n",
" <td>0</td>\n",
" <td>15.2458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>256</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>79.2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>30.0</td>\n",
" <td>0</td>\n",
" <td>86.5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>258</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>512.3292</td>\n",
" </tr>\n",
" <tr>\n",
" <th>259</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>50.0</td>\n",
" <td>0</td>\n",
" <td>26.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>260</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.7500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>261</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>3.0</td>\n",
" <td>4</td>\n",
" <td>31.3875</td>\n",
" </tr>\n",
" <tr>\n",
" <th>262</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>52.0</td>\n",
" <td>1</td>\n",
" <td>79.6500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>263</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>40.0</td>\n",
" <td>0</td>\n",
" <td>0.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>264</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.7500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>265</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>male</td>\n",
" <td>36.0</td>\n",
" <td>0</td>\n",
" <td>10.5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>266</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>16.0</td>\n",
" <td>4</td>\n",
" <td>39.6875</td>\n",
" </tr>\n",
" <tr>\n",
" <th>267</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>25.0</td>\n",
" <td>1</td>\n",
" <td>7.7750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>268</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>58.0</td>\n",
" <td>0</td>\n",
" <td>153.4625</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>135.6333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>31.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>271</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>25.0</td>\n",
" <td>0</td>\n",
" <td>0.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>272</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>41.0</td>\n",
" <td>0</td>\n",
" <td>19.5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>273</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>37.0</td>\n",
" <td>0</td>\n",
" <td>29.7000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.7500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Sex Age SibSp Fare\n",
"255 1 3 female 29.0 0 15.2458\n",
"256 1 1 female NaN 0 79.2000\n",
"257 1 1 female 30.0 0 86.5000\n",
"258 1 1 female 35.0 0 512.3292\n",
"259 1 2 female 50.0 0 26.0000\n",
"260 0 3 male NaN 0 7.7500\n",
"261 1 3 male 3.0 4 31.3875\n",
"262 0 1 male 52.0 1 79.6500\n",
"263 0 1 male 40.0 0 0.0000\n",
"264 0 3 female NaN 0 7.7500\n",
"265 0 2 male 36.0 0 10.5000\n",
"266 0 3 male 16.0 4 39.6875\n",
"267 1 3 male 25.0 1 7.7750\n",
"268 1 1 female 58.0 0 153.4625\n",
"269 1 1 female 35.0 0 135.6333\n",
"270 0 1 male NaN 0 31.0000\n",
"271 1 3 male 25.0 0 0.0000\n",
"272 1 2 female 41.0 0 19.5000\n",
"273 0 1 male 37.0 0 29.7000\n",
"274 1 3 female NaN 0 7.7500"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[255:275]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Fare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>255</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>29.0</td>\n",
" <td>0</td>\n",
" <td>15.2458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>256</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>79.2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>30.0</td>\n",
" <td>0</td>\n",
" <td>86.5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>258</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>-999.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>259</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>50.0</td>\n",
" <td>0</td>\n",
" <td>26.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>260</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.7500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>261</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>3.0</td>\n",
" <td>4</td>\n",
" <td>31.3875</td>\n",
" </tr>\n",
" <tr>\n",
" <th>262</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>52.0</td>\n",
" <td>1</td>\n",
" <td>79.6500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>263</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>40.0</td>\n",
" <td>0</td>\n",
" <td>-999.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>264</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.7500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>265</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>male</td>\n",
" <td>36.0</td>\n",
" <td>0</td>\n",
" <td>10.5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>266</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>16.0</td>\n",
" <td>4</td>\n",
" <td>39.6875</td>\n",
" </tr>\n",
" <tr>\n",
" <th>267</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>25.0</td>\n",
" <td>1</td>\n",
" <td>7.7750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>268</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>58.0</td>\n",
" <td>0</td>\n",
" <td>153.4625</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>135.6333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>31.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>271</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>25.0</td>\n",
" <td>0</td>\n",
" <td>-999.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>272</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>41.0</td>\n",
" <td>0</td>\n",
" <td>19.5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>273</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>37.0</td>\n",
" <td>0</td>\n",
" <td>29.7000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.7500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Sex Age SibSp Fare\n",
"255 1 3 female 29.0 0 15.2458\n",
"256 1 1 female NaN 0 79.2000\n",
"257 1 1 female 30.0 0 86.5000\n",
"258 1 1 female 35.0 0 -999.0000\n",
"259 1 2 female 50.0 0 26.0000\n",
"260 0 3 male NaN 0 7.7500\n",
"261 1 3 male 3.0 4 31.3875\n",
"262 0 1 male 52.0 1 79.6500\n",
"263 0 1 male 40.0 0 -999.0000\n",
"264 0 3 female NaN 0 7.7500\n",
"265 0 2 male 36.0 0 10.5000\n",
"266 0 3 male 16.0 4 39.6875\n",
"267 1 3 male 25.0 1 7.7750\n",
"268 1 1 female 58.0 0 153.4625\n",
"269 1 1 female 35.0 0 135.6333\n",
"270 0 1 male NaN 0 31.0000\n",
"271 1 3 male 25.0 0 -999.0000\n",
"272 1 2 female 41.0 0 19.5000\n",
"273 0 1 male 37.0 0 29.7000\n",
"274 1 3 female NaN 0 7.7500"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# see index 258,263,271 have been replaced\n",
"data2 = ot.impute_outlier_with_arbitrary(data=data,outlier_index=index,\n",
" value=-999,col=['Fare'])\n",
"data2[255:275]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Windsorization\n",
"top-coding & bottom coding (capping the maximum of a distribution at an arbitrarily set value,vice versa)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num of outlier detected: 19\n",
"Proportion of outlier detected 0.02132435465768799\n",
"Upper bound: 300 \n",
"Lower bound: 5\n"
]
}
],
"source": [
"# use any of the detection method above\n",
"index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)\n",
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Fare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>255</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>29.0</td>\n",
" <td>0</td>\n",
" <td>15.2458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>256</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>79.2000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>30.0</td>\n",
" <td>0</td>\n",
" <td>86.5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>258</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>300.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>259</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>50.0</td>\n",
" <td>0</td>\n",
" <td>26.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>260</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.7500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>261</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>3.0</td>\n",
" <td>4</td>\n",
" <td>31.3875</td>\n",
" </tr>\n",
" <tr>\n",
" <th>262</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>52.0</td>\n",
" <td>1</td>\n",
" <td>79.6500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>263</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>40.0</td>\n",
" <td>0</td>\n",
" <td>5.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>264</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.7500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>265</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>male</td>\n",
" <td>36.0</td>\n",
" <td>0</td>\n",
" <td>10.5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>266</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>16.0</td>\n",
" <td>4</td>\n",
" <td>39.6875</td>\n",
" </tr>\n",
" <tr>\n",
" <th>267</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>25.0</td>\n",
" <td>1</td>\n",
" <td>7.7750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>268</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>58.0</td>\n",
" <td>0</td>\n",
" <td>153.4625</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>135.6333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>31.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>271</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>25.0</td>\n",
" <td>0</td>\n",
" <td>5.0000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>272</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>41.0</td>\n",
" <td>0</td>\n",
" <td>19.5000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>273</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>37.0</td>\n",
" <td>0</td>\n",
" <td>29.7000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.7500</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Sex Age SibSp Fare\n",
"255 1 3 female 29.0 0 15.2458\n",
"256 1 1 female NaN 0 79.2000\n",
"257 1 1 female 30.0 0 86.5000\n",
"258 1 1 female 35.0 0 300.0000\n",
"259 1 2 female 50.0 0 26.0000\n",
"260 0 3 male NaN 0 7.7500\n",
"261 1 3 male 3.0 4 31.3875\n",
"262 0 1 male 52.0 1 79.6500\n",
"263 0 1 male 40.0 0 5.0000\n",
"264 0 3 female NaN 0 7.7500\n",
"265 0 2 male 36.0 0 10.5000\n",
"266 0 3 male 16.0 4 39.6875\n",
"267 1 3 male 25.0 1 7.7750\n",
"268 1 1 female 58.0 0 153.4625\n",
"269 1 1 female 35.0 0 135.6333\n",
"270 0 1 male NaN 0 31.0000\n",
"271 1 3 male 25.0 0 5.0000\n",
"272 1 2 female 41.0 0 19.5000\n",
"273 0 1 male 37.0 0 29.7000\n",
"274 1 3 female NaN 0 7.7500"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# see index 258,263,271 have been replaced with top/bottom coding\n",
"\n",
"data3 = ot.windsorization(data=data,col='Fare',para=para,strategy='both')\n",
"data3[255:275]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Discard outliers\n",
"Drop the cases that are outliers"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num of outlier detected: 19\n",
"Proportion of outlier detected 0.02132435465768799\n",
"Upper bound: 300 \n",
"Lower bound: 5\n"
]
}
],
"source": [
"# use any of the detection method above\n",
"index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)\n",
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"263.0\n",
"5.0\n"
]
}
],
"source": [
"# drop the outlier.\n",
"# we can see no more observations have value >300 or <5. They've been removed.\n",
"data4 = ot.drop_outlier(data=data,outlier_index=index)\n",
"print(data4.Fare.max())\n",
"print(data4.Fare.min())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mean/Median/Mode Imputation\n",
"replacing the outlier by mean/median/most frequent values of that variable"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Num of outlier detected: 19\n",
"Proportion of outlier detected 0.02132435465768799\n",
"Upper bound: 300 \n",
"Lower bound: 5\n"
]
}
],
"source": [
"# use any of the detection method above\n",
"index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)\n",
"print('Upper bound:',para[0],'\\nLower bound:',para[1])\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Fare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>255</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>29.0</td>\n",
" <td>0</td>\n",
" <td>15.245800</td>\n",
" </tr>\n",
" <tr>\n",
" <th>256</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>79.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>30.0</td>\n",
" <td>0</td>\n",
" <td>86.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>258</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>32.204208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>259</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>50.0</td>\n",
" <td>0</td>\n",
" <td>26.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>260</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>261</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>3.0</td>\n",
" <td>4</td>\n",
" <td>31.387500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>262</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>52.0</td>\n",
" <td>1</td>\n",
" <td>79.650000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>263</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>40.0</td>\n",
" <td>0</td>\n",
" <td>32.204208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>264</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>265</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>male</td>\n",
" <td>36.0</td>\n",
" <td>0</td>\n",
" <td>10.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>266</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>16.0</td>\n",
" <td>4</td>\n",
" <td>39.687500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>267</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>25.0</td>\n",
" <td>1</td>\n",
" <td>7.775000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>268</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>58.0</td>\n",
" <td>0</td>\n",
" <td>153.462500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>135.633300</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>31.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>271</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>25.0</td>\n",
" <td>0</td>\n",
" <td>32.204208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>272</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>41.0</td>\n",
" <td>0</td>\n",
" <td>19.500000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>273</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>male</td>\n",
" <td>37.0</td>\n",
" <td>0</td>\n",
" <td>29.700000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>7.750000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Sex Age SibSp Fare\n",
"255 1 3 female 29.0 0 15.245800\n",
"256 1 1 female NaN 0 79.200000\n",
"257 1 1 female 30.0 0 86.500000\n",
"258 1 1 female 35.0 0 32.204208\n",
"259 1 2 female 50.0 0 26.000000\n",
"260 0 3 male NaN 0 7.750000\n",
"261 1 3 male 3.0 4 31.387500\n",
"262 0 1 male 52.0 1 79.650000\n",
"263 0 1 male 40.0 0 32.204208\n",
"264 0 3 female NaN 0 7.750000\n",
"265 0 2 male 36.0 0 10.500000\n",
"266 0 3 male 16.0 4 39.687500\n",
"267 1 3 male 25.0 1 7.775000\n",
"268 1 1 female 58.0 0 153.462500\n",
"269 1 1 female 35.0 0 135.633300\n",
"270 0 1 male NaN 0 31.000000\n",
"271 1 3 male 25.0 0 32.204208\n",
"272 1 2 female 41.0 0 19.500000\n",
"273 0 1 male 37.0 0 29.700000\n",
"274 1 3 female NaN 0 7.750000"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# see index 258,263,271 have been replaced with mean\n",
"\n",
"data5 = ot.impute_outlier_with_avg(data=data,col='Fare',\n",
" outlier_index=index,strategy='mean')\n",
"data5[255:275]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}