mirror of
https://github.com/ashishpatel26/Amazing-Feature-Engineering.git
synced 2022-05-07 18:26:02 +03:00
1583 lines
44 KiB
Plaintext
1583 lines
44 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"# import seaborn as sns\n",
|
|
"# import matplotlib.pyplot as plt\n",
|
|
"import os\n",
|
|
"# plt.style.use('seaborn-colorblind')\n",
|
|
"# %matplotlib inline\n",
|
|
"from feature_cleaning import outlier as ot"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Load dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(891, 6)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"use_cols = [\n",
|
|
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
|
" 'Survived'\n",
|
|
"]\n",
|
|
"\n",
|
|
"\n",
|
|
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
|
|
"data.head(3)\n",
|
|
"print(data.shape)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"104 0.0000\n",
|
|
"163 4.0125\n",
|
|
"245 5.0000\n",
|
|
"152 6.2375\n",
|
|
"240 6.4375\n",
|
|
"237 6.4500\n",
|
|
"110 6.4958\n",
|
|
"85 6.7500\n",
|
|
"171 6.8583\n",
|
|
"238 6.9500\n",
|
|
"78 6.9750\n",
|
|
"185 7.0458\n",
|
|
"79 7.0500\n",
|
|
"218 7.0542\n",
|
|
"123 7.1250\n",
|
|
"76 7.1417\n",
|
|
"18 7.2250\n",
|
|
"32 7.2292\n",
|
|
"0 7.2500\n",
|
|
"91 7.3125\n",
|
|
"196 7.4958\n",
|
|
"186 7.5208\n",
|
|
"120 7.5500\n",
|
|
"192 7.6292\n",
|
|
"55 7.6500\n",
|
|
"182 7.7250\n",
|
|
"166 7.7292\n",
|
|
"93 7.7333\n",
|
|
"229 7.7375\n",
|
|
"227 7.7417\n",
|
|
" ... \n",
|
|
"46 80.0000\n",
|
|
"177 81.8583\n",
|
|
"30 82.1708\n",
|
|
"147 83.1583\n",
|
|
"47 83.4750\n",
|
|
"126 86.5000\n",
|
|
"180 89.1042\n",
|
|
"117 90.0000\n",
|
|
"136 91.0792\n",
|
|
"198 93.5000\n",
|
|
"200 106.4250\n",
|
|
"144 108.9000\n",
|
|
"143 110.8833\n",
|
|
"114 113.2750\n",
|
|
"168 120.0000\n",
|
|
"155 133.6500\n",
|
|
"151 134.5000\n",
|
|
"130 135.6333\n",
|
|
"27 146.5208\n",
|
|
"139 151.5500\n",
|
|
"129 153.4625\n",
|
|
"150 164.8667\n",
|
|
"224 211.3375\n",
|
|
"162 211.5000\n",
|
|
"199 221.7792\n",
|
|
"164 227.5250\n",
|
|
"75 247.5208\n",
|
|
"148 262.3750\n",
|
|
"23 263.0000\n",
|
|
"127 512.3292\n",
|
|
"Length: 248, dtype: float64"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"pd.Series(data.Fare.unique()).sort_values()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Detect by arbitrary boundary\n",
|
|
"identify outliers based on arbitrary boundaries"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Num of outlier detected: 19\n",
|
|
"Proportion of outlier detected 0.02132435465768799\n",
|
|
"Upper bound: 300 \n",
|
|
"Lower bound: 5\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"index,para = ot.outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)\n",
|
|
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"179 0.0000\n",
|
|
"806 0.0000\n",
|
|
"732 0.0000\n",
|
|
"674 0.0000\n",
|
|
"633 0.0000\n",
|
|
"597 0.0000\n",
|
|
"815 0.0000\n",
|
|
"466 0.0000\n",
|
|
"481 0.0000\n",
|
|
"302 0.0000\n",
|
|
"277 0.0000\n",
|
|
"271 0.0000\n",
|
|
"263 0.0000\n",
|
|
"413 0.0000\n",
|
|
"822 0.0000\n",
|
|
"378 4.0125\n",
|
|
"679 512.3292\n",
|
|
"737 512.3292\n",
|
|
"258 512.3292\n",
|
|
"Name: Fare, dtype: float64"
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# check the 19 found outliers\n",
|
|
"data.loc[index,'Fare'].sort_values()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## IQR method\n",
|
|
"outlier detection by Interquartile Ranges Rule"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Num of outlier detected: 31\n",
|
|
"Proportion of outlier detected 0.03479236812570146\n",
|
|
"Upper bound: 146.448 \n",
|
|
"Lower bound: -107.53760000000001\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"index,para = ot.outlier_detect_IQR(data=data,col='Fare',threshold=5)\n",
|
|
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"31 146.5208\n",
|
|
"195 146.5208\n",
|
|
"305 151.5500\n",
|
|
"708 151.5500\n",
|
|
"297 151.5500\n",
|
|
"498 151.5500\n",
|
|
"609 153.4625\n",
|
|
"332 153.4625\n",
|
|
"268 153.4625\n",
|
|
"318 164.8667\n",
|
|
"856 164.8667\n",
|
|
"730 211.3375\n",
|
|
"779 211.3375\n",
|
|
"689 211.3375\n",
|
|
"377 211.5000\n",
|
|
"527 221.7792\n",
|
|
"700 227.5250\n",
|
|
"716 227.5250\n",
|
|
"557 227.5250\n",
|
|
"380 227.5250\n",
|
|
"299 247.5208\n",
|
|
"118 247.5208\n",
|
|
"311 262.3750\n",
|
|
"742 262.3750\n",
|
|
"341 263.0000\n",
|
|
"88 263.0000\n",
|
|
"438 263.0000\n",
|
|
"27 263.0000\n",
|
|
"679 512.3292\n",
|
|
"258 512.3292\n",
|
|
"737 512.3292\n",
|
|
"Name: Fare, dtype: float64"
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# check the 31 found outliers\n",
|
|
"data.loc[index,'Fare'].sort_values()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Mean and Standard Deviation Method\n",
|
|
"outlier detection by Mean and Standard Deviation Method."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Num of outlier detected: 20\n",
|
|
"Proportion of outlier detected 0.02244668911335578\n",
|
|
"Upper bound: 181.2844937601173 \n",
|
|
"Lower bound: -116.87607782296811\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"index,para = ot.outlier_detect_mean_std(data=data,col='Fare',threshold=3)\n",
|
|
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"779 211.3375\n",
|
|
"730 211.3375\n",
|
|
"689 211.3375\n",
|
|
"377 211.5000\n",
|
|
"527 221.7792\n",
|
|
"716 227.5250\n",
|
|
"700 227.5250\n",
|
|
"380 227.5250\n",
|
|
"557 227.5250\n",
|
|
"118 247.5208\n",
|
|
"299 247.5208\n",
|
|
"311 262.3750\n",
|
|
"742 262.3750\n",
|
|
"27 263.0000\n",
|
|
"341 263.0000\n",
|
|
"88 263.0000\n",
|
|
"438 263.0000\n",
|
|
"258 512.3292\n",
|
|
"737 512.3292\n",
|
|
"679 512.3292\n",
|
|
"Name: Fare, dtype: float64"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# check the 20 found outliers\n",
|
|
"data.loc[index,'Fare'].sort_values()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## MAD method\n",
|
|
"outlier detection by Median and Median Absolute Deviation Method (MAD)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Num of outlier detected: 160\n",
|
|
"Proportion of outlier detected 0.17957351290684623\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# too aggressive for our dataset, about 18% of cases are detected as outliers.\n",
|
|
"index = ot.outlier_detect_MAD(data=data,col='Fare',threshold=3.5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Imputation with arbitrary value\n",
|
|
"impute outliers with arbitrary value"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Num of outlier detected: 19\n",
|
|
"Proportion of outlier detected 0.02132435465768799\n",
|
|
"Upper bound: 300 \n",
|
|
"Lower bound: 5\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# use any of the detection method above\n",
|
|
"index,para = ot.outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)\n",
|
|
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Survived</th>\n",
|
|
" <th>Pclass</th>\n",
|
|
" <th>Sex</th>\n",
|
|
" <th>Age</th>\n",
|
|
" <th>SibSp</th>\n",
|
|
" <th>Fare</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>255</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>29.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>15.2458</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>256</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>79.2000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>257</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>30.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>86.5000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>258</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>35.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>512.3292</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>259</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>50.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>26.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>260</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.7500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>261</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>31.3875</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>262</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>52.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>79.6500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>263</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>40.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>264</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.7500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>265</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>36.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>10.5000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>266</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>16.0</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>39.6875</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>267</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>25.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>7.7750</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>268</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>58.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>153.4625</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>269</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>35.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>135.6333</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>270</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>31.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>271</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>25.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>0.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>272</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>41.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>19.5000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>273</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>37.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>29.7000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>274</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.7500</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Survived Pclass Sex Age SibSp Fare\n",
|
|
"255 1 3 female 29.0 0 15.2458\n",
|
|
"256 1 1 female NaN 0 79.2000\n",
|
|
"257 1 1 female 30.0 0 86.5000\n",
|
|
"258 1 1 female 35.0 0 512.3292\n",
|
|
"259 1 2 female 50.0 0 26.0000\n",
|
|
"260 0 3 male NaN 0 7.7500\n",
|
|
"261 1 3 male 3.0 4 31.3875\n",
|
|
"262 0 1 male 52.0 1 79.6500\n",
|
|
"263 0 1 male 40.0 0 0.0000\n",
|
|
"264 0 3 female NaN 0 7.7500\n",
|
|
"265 0 2 male 36.0 0 10.5000\n",
|
|
"266 0 3 male 16.0 4 39.6875\n",
|
|
"267 1 3 male 25.0 1 7.7750\n",
|
|
"268 1 1 female 58.0 0 153.4625\n",
|
|
"269 1 1 female 35.0 0 135.6333\n",
|
|
"270 0 1 male NaN 0 31.0000\n",
|
|
"271 1 3 male 25.0 0 0.0000\n",
|
|
"272 1 2 female 41.0 0 19.5000\n",
|
|
"273 0 1 male 37.0 0 29.7000\n",
|
|
"274 1 3 female NaN 0 7.7500"
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data[255:275]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Survived</th>\n",
|
|
" <th>Pclass</th>\n",
|
|
" <th>Sex</th>\n",
|
|
" <th>Age</th>\n",
|
|
" <th>SibSp</th>\n",
|
|
" <th>Fare</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>255</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>29.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>15.2458</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>256</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>79.2000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>257</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>30.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>86.5000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>258</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>35.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>-999.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>259</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>50.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>26.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>260</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.7500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>261</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>31.3875</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>262</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>52.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>79.6500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>263</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>40.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>-999.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>264</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.7500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>265</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>36.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>10.5000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>266</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>16.0</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>39.6875</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>267</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>25.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>7.7750</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>268</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>58.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>153.4625</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>269</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>35.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>135.6333</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>270</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>31.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>271</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>25.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>-999.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>272</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>41.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>19.5000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>273</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>37.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>29.7000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>274</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.7500</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Survived Pclass Sex Age SibSp Fare\n",
|
|
"255 1 3 female 29.0 0 15.2458\n",
|
|
"256 1 1 female NaN 0 79.2000\n",
|
|
"257 1 1 female 30.0 0 86.5000\n",
|
|
"258 1 1 female 35.0 0 -999.0000\n",
|
|
"259 1 2 female 50.0 0 26.0000\n",
|
|
"260 0 3 male NaN 0 7.7500\n",
|
|
"261 1 3 male 3.0 4 31.3875\n",
|
|
"262 0 1 male 52.0 1 79.6500\n",
|
|
"263 0 1 male 40.0 0 -999.0000\n",
|
|
"264 0 3 female NaN 0 7.7500\n",
|
|
"265 0 2 male 36.0 0 10.5000\n",
|
|
"266 0 3 male 16.0 4 39.6875\n",
|
|
"267 1 3 male 25.0 1 7.7750\n",
|
|
"268 1 1 female 58.0 0 153.4625\n",
|
|
"269 1 1 female 35.0 0 135.6333\n",
|
|
"270 0 1 male NaN 0 31.0000\n",
|
|
"271 1 3 male 25.0 0 -999.0000\n",
|
|
"272 1 2 female 41.0 0 19.5000\n",
|
|
"273 0 1 male 37.0 0 29.7000\n",
|
|
"274 1 3 female NaN 0 7.7500"
|
|
]
|
|
},
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# see index 258,263,271 have been replaced\n",
|
|
"data2 = ot.impute_outlier_with_arbitrary(data=data,outlier_index=index,\n",
|
|
" value=-999,col=['Fare'])\n",
|
|
"data2[255:275]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Windsorization\n",
|
|
"top-coding & bottom coding (capping the maximum of a distribution at an arbitrarily set value,vice versa)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Num of outlier detected: 19\n",
|
|
"Proportion of outlier detected 0.02132435465768799\n",
|
|
"Upper bound: 300 \n",
|
|
"Lower bound: 5\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# use any of the detection method above\n",
|
|
"index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)\n",
|
|
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Survived</th>\n",
|
|
" <th>Pclass</th>\n",
|
|
" <th>Sex</th>\n",
|
|
" <th>Age</th>\n",
|
|
" <th>SibSp</th>\n",
|
|
" <th>Fare</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>255</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>29.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>15.2458</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>256</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>79.2000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>257</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>30.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>86.5000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>258</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>35.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>300.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>259</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>50.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>26.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>260</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.7500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>261</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>31.3875</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>262</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>52.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>79.6500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>263</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>40.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>5.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>264</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.7500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>265</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>36.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>10.5000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>266</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>16.0</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>39.6875</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>267</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>25.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>7.7750</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>268</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>58.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>153.4625</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>269</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>35.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>135.6333</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>270</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>31.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>271</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>25.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>5.0000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>272</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>41.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>19.5000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>273</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>37.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>29.7000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>274</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.7500</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Survived Pclass Sex Age SibSp Fare\n",
|
|
"255 1 3 female 29.0 0 15.2458\n",
|
|
"256 1 1 female NaN 0 79.2000\n",
|
|
"257 1 1 female 30.0 0 86.5000\n",
|
|
"258 1 1 female 35.0 0 300.0000\n",
|
|
"259 1 2 female 50.0 0 26.0000\n",
|
|
"260 0 3 male NaN 0 7.7500\n",
|
|
"261 1 3 male 3.0 4 31.3875\n",
|
|
"262 0 1 male 52.0 1 79.6500\n",
|
|
"263 0 1 male 40.0 0 5.0000\n",
|
|
"264 0 3 female NaN 0 7.7500\n",
|
|
"265 0 2 male 36.0 0 10.5000\n",
|
|
"266 0 3 male 16.0 4 39.6875\n",
|
|
"267 1 3 male 25.0 1 7.7750\n",
|
|
"268 1 1 female 58.0 0 153.4625\n",
|
|
"269 1 1 female 35.0 0 135.6333\n",
|
|
"270 0 1 male NaN 0 31.0000\n",
|
|
"271 1 3 male 25.0 0 5.0000\n",
|
|
"272 1 2 female 41.0 0 19.5000\n",
|
|
"273 0 1 male 37.0 0 29.7000\n",
|
|
"274 1 3 female NaN 0 7.7500"
|
|
]
|
|
},
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# see index 258,263,271 have been replaced with top/bottom coding\n",
|
|
"\n",
|
|
"data3 = ot.windsorization(data=data,col='Fare',para=para,strategy='both')\n",
|
|
"data3[255:275]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Discard outliers\n",
|
|
"Drop the cases that are outliers"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Num of outlier detected: 19\n",
|
|
"Proportion of outlier detected 0.02132435465768799\n",
|
|
"Upper bound: 300 \n",
|
|
"Lower bound: 5\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# use any of the detection method above\n",
|
|
"index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)\n",
|
|
"print('Upper bound:',para[0],'\\nLower bound:',para[1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"263.0\n",
|
|
"5.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# drop the outlier.\n",
|
|
"# we can see no more observations have value >300 or <5. They've been removed.\n",
|
|
"data4 = ot.drop_outlier(data=data,outlier_index=index)\n",
|
|
"print(data4.Fare.max())\n",
|
|
"print(data4.Fare.min())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Mean/Median/Mode Imputation\n",
|
|
"replacing the outlier by mean/median/most frequent values of that variable"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Num of outlier detected: 19\n",
|
|
"Proportion of outlier detected 0.02132435465768799\n",
|
|
"Upper bound: 300 \n",
|
|
"Lower bound: 5\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# use any of the detection method above\n",
|
|
"index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)\n",
|
|
"print('Upper bound:',para[0],'\\nLower bound:',para[1])\n",
|
|
" "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Survived</th>\n",
|
|
" <th>Pclass</th>\n",
|
|
" <th>Sex</th>\n",
|
|
" <th>Age</th>\n",
|
|
" <th>SibSp</th>\n",
|
|
" <th>Fare</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>255</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>29.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>15.245800</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>256</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>79.200000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>257</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>30.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>86.500000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>258</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>35.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>32.204208</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>259</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>50.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>26.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>260</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.750000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>261</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>31.387500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>262</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>52.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>79.650000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>263</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>40.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>32.204208</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>264</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.750000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>265</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>36.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>10.500000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>266</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>16.0</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>39.687500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>267</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>25.0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>7.775000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>268</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>58.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>153.462500</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>269</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>35.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>135.633300</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>270</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>31.000000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>271</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>25.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>32.204208</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>272</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>41.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>19.500000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>273</th>\n",
|
|
" <td>0</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>male</td>\n",
|
|
" <td>37.0</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>29.700000</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>274</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>female</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>0</td>\n",
|
|
" <td>7.750000</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Survived Pclass Sex Age SibSp Fare\n",
|
|
"255 1 3 female 29.0 0 15.245800\n",
|
|
"256 1 1 female NaN 0 79.200000\n",
|
|
"257 1 1 female 30.0 0 86.500000\n",
|
|
"258 1 1 female 35.0 0 32.204208\n",
|
|
"259 1 2 female 50.0 0 26.000000\n",
|
|
"260 0 3 male NaN 0 7.750000\n",
|
|
"261 1 3 male 3.0 4 31.387500\n",
|
|
"262 0 1 male 52.0 1 79.650000\n",
|
|
"263 0 1 male 40.0 0 32.204208\n",
|
|
"264 0 3 female NaN 0 7.750000\n",
|
|
"265 0 2 male 36.0 0 10.500000\n",
|
|
"266 0 3 male 16.0 4 39.687500\n",
|
|
"267 1 3 male 25.0 1 7.775000\n",
|
|
"268 1 1 female 58.0 0 153.462500\n",
|
|
"269 1 1 female 35.0 0 135.633300\n",
|
|
"270 0 1 male NaN 0 31.000000\n",
|
|
"271 1 3 male 25.0 0 32.204208\n",
|
|
"272 1 2 female 41.0 0 19.500000\n",
|
|
"273 0 1 male 37.0 0 29.700000\n",
|
|
"274 1 3 female NaN 0 7.750000"
|
|
]
|
|
},
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# see index 258,263,271 have been replaced with mean\n",
|
|
"\n",
|
|
"data5 = ot.impute_outlier_with_avg(data=data,col='Fare',\n",
|
|
" outlier_index=index,strategy='mean')\n",
|
|
"data5[255:275]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.1"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|